
* Root folder for the .dta files read and written below; referenced throughout as $data.
* NOTE(review): user-specific absolute path -- change it before running on another machine.
global data "C:\Users\mlombardi\Dropbox\Bono Escuela - Peru\Data"


**********************************************
** CONSTRUCTION OF DATASET FOR MAIN RESULTS **
**********************************************
{

*1) Identifying BE groups
* Builds one row per school (cod_mod, anexo) with its comparison group (group_be) and the
* number of schools in that group, and saves it as ECE_groups_BE_new.dta.
{
* Start off with the public schools that were eligible for ECE (8297 in the original run):
* schools with 8th graders ("SEGUNDO") in 2015, one row per school
use "$data\secundaria_23022016.dta", clear
keep if grado=="SEGUNDO" & id_anio==2015
keep cod_mod anexo id_anio
duplicates drop

* Merge with dataset with characteristics of secondary schools (public/private and location)
merge m:1 id_anio anexo cod_mod using "$data\IIEE_secundaria.dta"
keep if _merge==3
drop if tip_gestion=="PRIVADA" // Drop private schools
drop _merge

* File identifying schools that participated in ECE.
* The path is written relative to $data so that the project root is defined in one place only;
* with the default $data it resolves to the same "Dta files" folder that was hard-coded before.
merge m:1 cod_mod anexo using "$data\..\Dta files\BE_ECE_Schools.dta"
keep if _merge==3
drop _merge

* Is it JEC? File identifying school day length in 2015.
* List of schools with full day (JEC) available in http://www.minedu.gob.pe/a/006.php
merge m:1 cod_mod using "$data\..\Other Data\JEC.dta"
drop if _merge==2
replace jec=0 if _merge==1 // schools not found in the JEC file get jec=0
drop _merge

egen school_id=group(cod_mod anexo)
gen rural=area=="Rural"


** JEC schools are grouped according to their DRE. If the size of the group is smaller than 5 schools, they are grouped with the closest DRE (geographically)
** non-JEC schools are grouped according to their UGEL and ambito (urban/rural). If the size of the group is smaller than 5, they are only grouped according to their UGEL

keep cod_mod anexo school_id rural jec cod_dre nom_dre cod_ugel nom_ugel

* Groups of the first type: running index over the distinct DREs (26 in the original data)
bysort cod_dre: gen f=_n==1
gen group_jec=sum(f)
* Number of distinct DREs, taken from the data instead of the former hard-coded 26, so that the
* non-JEC group ids below can never collide with the JEC ones if the list of DREs changes
egen n_dre=max(group_jec)
replace group_jec=. if jec==0
drop f

* Groups of the second type: UGEL x rural, numbered after the DRE groups
bysort cod_ugel rural: gen f=_n==1
gen group_jes=sum(f)
replace group_jes=. if jec==1
replace group_jes=group_jes+n_dre

gen group=group_jec if jec==1
replace group=group_jes if jec==0

gen aux=1
bysort group: egen num=total(aux)

* Some schools (168 in the original run) are in groups with less than 5 schools. We have to create a group with their UGEL, instead of UGEL x rural/urban
gen problem=num<5
tab problem if jec==1
tab problem if jec==0

*Non-JEC schools: small groups take the lowest non-JEC group id found in their UGEL
gen aux2=group
replace aux2=. if jec==1
bysort cod_ugel: egen aux3=min(aux2)
replace group=aux3 if jec==0 & problem==1
drop aux2 aux3 problem num
bysort group: egen num=total(aux)

*JEC schools
*Callao only has 2 schools, join it with Lima Metropolitana (closest DRE)
tab group if jec==1 & nom_dre=="DRE Lima Metropolitana"
* Look up the Lima Metropolitana group id (15 in the original data) instead of hard-coding it
gen aux_lima=group if jec==1 & nom_dre=="DRE Lima Metropolitana"
egen group_lima=min(aux_lima)
assert group_lima<. // stop if there is no JEC school in DRE Lima Metropolitana to join Callao with
replace group=group_lima if jec==1 & nom_dre=="DRE Callao"

egen group_be=group(group)

keep cod_mod anexo group_be
gen aux=1
bysort group_be: egen num_schools_group=total(aux)
drop aux
save "$data\ECE_groups_BE_new.dta", replace
}


*2) Obtaining teacher characteristics
{
use "$data\docentesxseccion_stata12.dta" , clear

* Fix the rows whose columns are shifted (flagged by a non-empty col14). In those rows what appears
* as id_docente is actually the numero de documento, and the true id_docente is missing. It is
* recovered further below from the teacher's other observations; if there are none, an id is
* built from numero_documento.
* Back up the string columns that are about to be overwritten.
foreach x in turno numero_documento genero fecha_nacimiento {
gen `x'2=`x' if col14!=""
}
* id_docente is numeric. generate defaults to float, which cannot represent integers above
* 16,777,216 exactly and would silently alter long document numbers, so the backup is a double.
gen double id_docente2=id_docente if col14!=""

* Move every value one column back to where it belongs
replace fecha_nacimiento=col14 if col14!=""
replace genero=fecha_nacimiento2 if col14!=""
replace id_docente=. if col14!=""
tostring id_docente2, replace
replace numero_documento=id_docente2 if col14!=""
replace turno=numero_documento2 if col14!=""

drop numero_documento2

* The section name was split over two columns: glue it back together
gen space=" "
egen seccion_fix=concat(dsc_seccion space turno2) if col14!=""
replace dsc_seccion=seccion_fix if col14!=""

gen problem1=col14!=""
drop col14- seccion_fix

*Fix the ones that have all of the information bunched in dsc_seccion (pipe-delimited)
gen problem2=problem1==0 & id_docente==.

split dsc_seccion if problem2==1, parse(|)
replace dsc_seccion=dsc_seccion1 if problem2==1
replace turno=dsc_seccion2 if problem2==1
replace numero_documento=dsc_seccion3 if problem2==1
destring dsc_seccion4, replace
replace id_docente=dsc_seccion4 if problem2==1
replace genero=dsc_seccion8 if problem2==1
replace fecha_nacimiento=dsc_seccion9 if problem2==1

drop dsc_seccion1- dsc_seccion9

destring numero_documento, replace

*Try and find the id_docente from other observations for those who had a column shifted
* (explicit double: egen would otherwise store the id in the default float type)
bysort numero_documento: egen double id_aux=min(id_docente)
replace id_docente=id_aux if numero_documento!=. & id_docente==.
drop id_aux

*For the teachers that still don't have an id_docente (10 in the original run), because it's the only observation with their numero de documento, give them a fake one starting from the largest id_docente number
egen aux=group(numero_documento)
* r(max) is held in double precision, so the new ids cannot collide with existing ones through rounding
quietly summarize id_docente, meanonly
replace id_docente=aux+r(max) if id_docente==.
drop aux problem1 problem2

* School characteristics. Path written relative to $data (same "Raw Data" folder as before)
* so the project root is defined in one place only.
merge m:1 id_anio cod_mod anexo using "$data\..\Raw Data\IIEE.dta"
drop if _merge==2
drop cod_dre- tip_gestion dsc_caracteristica- _merge

*Indicator for whether the teacher was NOT working in the same school the previous year, and
*indicator for whether the teacher was NOT teaching the same grade (in any school) in the previous year
* Row-level year flags shared by both indicators
gen aux_2013=(id_anio==2013)
gen aux_2014=(id_anio==2014)

* Was this teacher observed in this school (any grade) in 2013 / 2014?
bysort id_docente cod_mod anexo: egen was_in_2013=max(aux_2013)
bysort id_docente cod_mod anexo: egen was_in_2014=max(aux_2014)
*Dummy =1 if the teacher is NOT observed in the same school (any grade) in the year before
gen new_to_school=0
replace new_to_school=1 if was_in_2013==0 & id_anio==2014
replace new_to_school=1 if was_in_2014==0 & id_anio==2015
replace new_to_school=. if id_anio==2013 // no previous year in the data
drop was_in*

* Was this teacher observed teaching this level/grade (any school) in 2013 / 2014?
bysort id_docente nivel dsc_grado: egen was_in_2013=max(aux_2013)
bysort id_docente nivel dsc_grado: egen was_in_2014=max(aux_2014)
*Dummy =1 if the teacher is NOT observed in the same grade in the year before
gen new_to_grade=0
replace new_to_grade=1 if was_in_2013==0 & id_anio==2014
replace new_to_grade=1 if was_in_2014==0 & id_anio==2015
replace new_to_grade=. if id_anio==2013 // no previous year in the data
drop aux_* was_in*

* Number of schools a teacher instructs in
bysort id_anio id_docente cod_mod anexo: gen aux=(_n==1)
bysort id_anio id_docente: egen num_schools=total(aux)
label variable num_schools "Number of schools taught by teacher in that year"

* Only nivel and aux are left to remove here: the aux_* and was_in* helpers were already dropped
* above, and listing them again made -drop- stop with "variable not found".
drop nivel aux

*Age
* Birth year = first four characters of fecha_nacimiento
gen birth_year=substr( fecha_nacimiento,1,4)
destring birth_year, replace

* Correct the birth years with typos BEFORE computing age. Previously age was generated first,
* so these corrections never reached it and the affected teachers ended up with a missing age.
replace birth_year=1951 if birth_year==2195
replace birth_year=1962 if birth_year==2196
replace birth_year=1972 if birth_year==2197
replace birth_year=1977 if birth_year==7197
replace birth_year=1965 if birth_year==2965
replace birth_year=1968 if birth_year==2968
replace birth_year=1973 if birth_year==2973
replace birth_year=1980 if birth_year==2980
replace birth_year=1966 if birth_year==7966
replace birth_year=1969 if birth_year==7969
replace birth_year=1980 if birth_year==7980
replace birth_year=1981 if birth_year==7981
replace birth_year=1956 if birth_year==9956

gen age=id_anio-birth_year

* Implausible ages are set to missing
replace age=. if age>100 | age<0
*50 obs with age>80, like birth year 1900, obviously typo
replace age=. if age>80
drop birth_year

* Male dummy; any value of genero other than "M" (including empty) is coded 0
gen male=(genero=="M")


* Teaching load: number of rows the teacher has in the school/year, and in the school/grade/year
bysort id_anio id_docente cod_mod anexo: gen num_classes_school=_N
bysort id_anio id_docente cod_mod anexo dsc_grado: gen num_classes_school_grade=_N

label variable num_classes_school "Number of classes taught by teacher in school/year"
label variable num_classes_school_grade "Number of classes taught by teacher in school/grade/year"


* Number of distinct teachers in each school/grade/year: flag one row per teacher, then add the flags up
bysort id_anio cod_mod anexo dsc_grado id_docente: gen first_teacher_row=(_n==1)
bysort id_anio cod_mod anexo dsc_grado: egen num_teachers_grado=total(first_teacher_row)
drop first_teacher_row

label variable num_teachers_grado "Number of teachers in that school/grade/year"

* Grade labels: PRIMERO..QUINTO of secondary correspond to grades 7 to 11
local grados PRIMERO SEGUNDO TERCERO CUARTO QUINTO
local grades 7mo 8vo 9no 10mo 11vo
gen grade=""
forvalues i=1/5 {
    replace grade="`: word `i' of `grades''" if dsc_grado=="`: word `i' of `grados''"
}

* taught_X = 1 if the teacher has at least one row in grade X in that school/year
foreach g of local grades {
    bysort id_docente cod_mod anexo id_anio: egen taught_`g'=max(grade=="`g'")
}

*Aggregate by school/grade/year
* One row per teacher within each school/grade/year cell, so that the cell means below are taken over teachers
local cell id_anio cod_mod anexo dsc_grado
keep `cell' id_docente new_to_school new_to_grade num_schools age male ///
    num_classes_school num_classes_school_grade num_teachers_grado ///
    taught_7mo taught_8vo taught_9no taught_10mo taught_11vo
duplicates drop

bysort `cell': egen avg_age_grade=mean(age)
bysort `cell': egen perc_male_grade=mean(male)

*In each grade, what share of teachers also instructs these other grades
foreach g in 7mo 8vo 9no 10mo 11vo {
    bysort `cell': egen perc_teaches_`g'=mean(taught_`g')
}

* Share of teachers that are new to the school / to the grade (not defined for 2013)
foreach v in school grade {
    bysort `cell': egen perc_new_to_`v'=mean(new_to_`v')
}
foreach v in school grade {
    replace perc_new_to_`v'=. if id_anio==2013
}

bysort `cell': egen avg_num_schools=mean(num_schools)
bysort `cell': egen avg_num_classes_school_grade=mean(num_classes_school_grade)

* Keep the 8th and 9th grade cells only, one row per cell with the cell-level variables
keep if inlist(dsc_grado, "SEGUNDO", "TERCERO")
keep `cell' num_teachers_grado avg_age_grade perc_male_grade perc_teaches_7mo- perc_teaches_11vo ///
    perc_new_to_school perc_new_to_grade avg_num_schools avg_num_classes_school_grade
duplicates drop

* Same name as the grade variable in the student file, so the two can be merged
rename dsc_grado grado

save "$data\Teacher_Grade_Characteristics.dta", replace

}



*3) Obtaining student characteristics, and merging with school and teacher characteristics
{
use "$data\secundaria_23022016.dta", clear
keep if grado=="TERCERO" | grado=="SEGUNDO" | grado=="PRIMERO" // Keep 7th, 8th and 9th grade students (7th graders to get the lagged grades)

*There are 70 obs in 2014 that are shifted, adjust them.
* Shifted rows are the ones with a non-empty col43. In those rows seccion is rebuilt as
* "seccion turno", and from turno onwards every column takes the value stored in the next column.
* The replacements run left to right, so each source is still untouched when it is read.
gen space=" "
egen seccion2=concat(seccion space turno) if  col43!=""
replace seccion=seccion2  if  col43!=""
drop seccion2 space
* The course grades are converted to strings so they can be moved between string and numeric columns
tostring  arte- pfrrhh, replace
replace turno=arte if col43!="" 
replace arte=cta if col43!=""
replace cta=comu  if col43!=""
replace comu=efis if col43!=""
replace efis=etra  if col43!=""
replace etra=erel if col43!=""
replace erel=fcc  if col43!=""
replace fcc=hge  if col43!=""
replace hge=ingl  if col43!=""
replace ingl=mate  if col43!=""
replace mate=pfrrhh if col43!=""
replace pfrrhh=sit_final if col43!=""
* Back to numeric (destring without force leaves a variable as string if it has non-numeric content)
destring  arte- pfrrhh, replace
replace sit_final=mot_ret	 if col43!=""
replace mot_ret=fecha_nacimiento if col43!=""
replace fecha_nacimiento=sexo if col43!=""
replace sexo=id_pais if col43!=""
replace id_pais=lugar_nacimiento   if col43!=""
replace lugar_nacimiento=dpto_prov_dist   if col43!=""
replace dpto_prov_dist=lengua_materna   if col43!=""
replace lengua_materna=segunda_lengua   if col43!=""
replace segunda_lengua=tipo_discapacidad   if col43!=""
replace tipo_discapacidad=instruccion_madre  if col43!=""
replace instruccion_madre=lugar_residencia  if col43!=""
* cod_mod_traslado is temporarily a string so it can pass its value on and receive the next one
tostring cod_mod_traslado, replace
replace lugar_residencia=cod_mod_traslado   if col43!=""
replace  cod_mod_traslado=situacion_matricula  if col43!=""
destring cod_mod_traslado, replace
replace  situacion_matricula=trabaja  if col43!=""
replace trabaja=horas_semanales_trabajo   if col43!=""
replace horas_semanales_trabajo=padre_vive    if col43!=""
replace  padre_vive=madre_vive   if col43!=""
replace  madre_vive=fecha_nacimiento_padre   if col43!=""
replace  fecha_nacimiento_padre=nivel_instruccion_padre  if col43!=""
replace  nivel_instruccion_padre=vive_con_estudiante_padre   if col43!=""
replace  vive_con_estudiante_padre=fecha_nacimiento_madre   if col43!=""
replace  fecha_nacimiento_madre= nivel_instruccion_madre    if col43!=""
replace  nivel_instruccion_madre=vive_con_estudiante_madre   if col43!=""
replace  vive_con_estudiante_madre=col43   if col43!=""
drop col43

* Second pass for any grade column that is still a string; for fcc, force turns non-numeric entries into missing
destring  arte- pfrrhh, replace
destring fcc, replace force

* Attach the characteristics of secondary schools (public/private and location) and keep only
* the student rows whose school-year is found in that file
merge m:1 id_anio anexo cod_mod using "$data\IIEE_secundaria.dta", keep(match) nogenerate

* Variables that are not used later on and that vary within student-year-school although they
* should not, which produces duplicate observations
drop dsc_caracteristica lugar_nacimiento dpto_prov_dist segunda_lengua trabaja horas_semanales_trabajo ///
    fecha_nacimiento_padre fecha_nacimiento_madre nom_dre nom_ugel departamento provincia distrito ///
    cen_pob fecha_inicio_anio fecha_fin_anio cod_dre dsc_modalidad lugar_residencia nivel
* With those gone, fully identical rows can be removed
duplicates drop

* 2015 rows that are not needed: 7th graders and private school students
* (not part of our sample, and not necessary for lagged grades)
drop if id_anio==2015 & grado=="PRIMERO"
drop if id_anio==2015 & tip_gestion=="PRIVADA"

*** Deal with students with more than one observation in a given year
* The drops below are applied in sequence, and the per-student counters (num_obs, num_has_grades,
* num_transfers) are recomputed after each round, so the order of the steps matters.
{
gen aux=1
bysort id_persona id_anio: egen num_obs=sum(aux)

*Do they have grades in all observations? (has_grades = a math or a language grade is present)
gen has_grades=(mate!=. | comu!=.)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*If they have only one observation with grades, the other ones usually correspond to schools they transferred from after the school year started. Keep the only observation with grades
drop if has_grades==0 & num_has_grades==1 & num_obs>1 & sit_final=="Trasladado"

drop num_obs
bysort id_persona id_anio: egen num_obs=sum(aux)
*Very few left now (0.09%). Mostly kids who have grades in more than one observation. Drop observations where the kid was transferred, for kids who have at least another observation with grades in which they were not transferred
gen transferred=sit_final=="Trasladado"
*Do the non-transferred obs have grades?
gen aux2=transferred==0 & has_grades==1
bysort id_persona id_anio: egen num_grades_nontrans=sum(aux2)
drop if num_obs>1 & transferred==1 & num_grades_nontrans>0
drop transferred num_grades_nontrans num_obs aux2 num_has_grades

bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*Very few left now (0.06%). It's around 2204 kids, most have grades in 2 observations. How many of them are in the same school?
drop aux
bysort id_persona id_anio cod_mod anexo: gen aux=(_n==1) 
bysort id_persona id_anio: egen num_schools=sum(aux)
*79% of kids with more than one obs are in the same school in all obs. For those who have more than one obs in the same school, it's just a covariate that differs >> I keep the first
* NOTE(review): rows are ordered only by has_grades within student-year, and Stata's sort is not
* stable, so which of several tied rows comes first can differ between runs -- confirm this is acceptable
tab num_schools if num_obs>1
gsort id_anio id_persona -has_grades
by id_anio id_persona: gen first_person=(_n==1)
drop if first_person==0 & num_schools==1 & num_obs>1

*There's also a few cases of kids with 2 schools, but 2 observations in one of the schools where just a covariate differs. In the duplicate school, keep one
drop aux first_person
gen aux=1
bysort id_persona id_anio cod_mod anexo: egen num_same_school=sum(aux)
gsort id_anio id_persona cod_mod  anexo -has_grades
by id_anio id_persona cod_mod anexo: gen first_person=(_n==1)
drop if first_person==0 & num_same_school==2

drop first_person aux num_obs num_same_school
gen aux=1
bysort id_persona id_anio: egen num_obs=sum(aux)

*Now it's 466 students with 2 observations, all of which have data in two schools
*Note: cod_mod_traslado is the code of the school the student came from (the origin school). So if a student has grades in two different schools, but was transferred from one of them to another, keep the destination school
gen aux2=cod_mod_traslado!=.
bysort id_anio id_persona: egen was_transferred=max(aux2)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Drop the origin school
drop if num_obs>1 & num_transfers==1 & cod_mod_traslado==.

*If they have two transfers, one with grades and another without, keep the one with grades
drop num_has_grades
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)
drop if num_obs>1 & num_transfers==2 & has_grades==0 & num_has_grades==1 

drop num_transfers num_obs 
bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Only 154 students with more than one observation. It's 0.00% of the students. Drop one of them
* NOTE(review): no tie-break within student-year here, so the row that is kept is arbitrary
bysort id_persona id_anio: gen first_person=_n==1
drop if first_person==0 & num_obs>1
* Removes every helper created from num_schools onwards (range drop by variable order); has_grades was created earlier and is kept
drop num_schools- first_person
*Drop the student who passed away and therefore has no grades
drop if sit_final=="Fallecidos" & has_grades==0
}

* A student-year with neither a math nor a language grade is flagged as dropped out
gen dropped_out=has_grades==0
drop has_grades

*Get grade of previous year
* Step 1: spread each student's 2014 and 2013 grade over all of his/her rows
foreach subj in mate comu {
    foreach yy in 14 13 {
        bysort id_persona: egen `subj'_`yy'=max(cond(id_anio==20`yy', `subj', .))
    }
}
* Step 2: the lagged grade is the 2014 grade for 2015 rows and the 2013 grade for 2014 rows
foreach subj in mate comu {
    gen previous_`subj'=cond(id_anio==2015, `subj'_14, `subj'_13) if inlist(id_anio, 2014, 2015)
}
drop mate_14- comu_13


** Keep public school students from 8th and 9th
drop if tip_gestion=="PRIVADA"
drop if grado=="PRIMERO"


*Individual Controls
* Dummies built with an -if- qualifier are left missing where the source field is empty
gen male=sexo=="HOMBRE" if sexo!=""
gen foreigner=id_pais!="PE" if id_pais!=""
gen lengua_materna_cast=lengua_materna=="CASTELLANO" if lengua_materna!=""
gen discapacidad=(tipo_discapacidad!="")
* Low education = no schooling or primary (complete or incomplete)
gen mother_low_educ=inlist(nivel_instruccion_madre, "NINGUNO", "PRIM.COMP", "PRIM.INCOM") if nivel_instruccion_madre!=""
gen father_low_educ=inlist(nivel_instruccion_padre, "NINGUNO", "PRIM.COMP", "PRIM.INCOM") if nivel_instruccion_padre!=""
* An empty field counts as the parent being alive
gen father_alive=inlist(padre_vive, "SI", "")
gen mother_alive=inlist(madre_vive, "SI", "")
gen lives_father=(vive_con_estudiante_padre=="SI")
gen lives_mother=(vive_con_estudiante_madre=="SI")
gen repetidor=(situacion_matricula=="REPITE")
drop sexo id_pais lengua_materna tipo_discapacidad padre_vive madre_vive ///
    vive_con_estudiante_padre vive_con_estudiante_madre nivel_instruccion_madre nivel_instruccion_padre

keep id_persona id_anio cod_mod anexo tip_gestion grado seccion turno arte- sit_final fecha_nacimiento cod_ugel ///
    dropped_out previous_mate previous_comu male foreigner lengua_materna_cast discapacidad ///
    mother_low_educ father_low_educ father_alive mother_alive lives_father lives_mother repetidor


* Restrict to schools that participated in ECE (our sample excludes schools that did not take the ECE)
merge m:1 cod_mod anexo using "$data\BE_ECE_Schools.dta", keep(match) nogenerate

*Only keep schools that have both grades in all 3 years, i.e. 6 year x grade cells
bysort id_anio cod_mod anexo grado: gen first_in_cell=_n==1
bysort cod_mod anexo: egen num_grades_years=total(first_in_cell)
keep if num_grades_years==6
drop first_in_cell num_grades_years


*How many students in each grade?
bysort id_anio cod_mod anexo grado: gen num_students_school_grado=_N
*How many students in each classroom?
bysort id_anio cod_mod anexo grado turno seccion: gen num_students_seccion=_N
*How many classrooms? Flag one row per classroom and add the flags up within the grade
bysort id_anio cod_mod anexo grado turno seccion: gen first_seccion=(_n==1)
bysort id_anio cod_mod anexo grado: egen num_secciones_grado=total(first_seccion)
drop first_seccion

* NOTE(review): area is not among the variables kept before this point, so it presumably comes
* from BE_ECE_Schools.dta -- confirm
gen rural=area=="Rural"
drop area

* Students per classroom in the grade
gen teacher_pupil_ratio_grade=num_students_school_grado/num_secciones_grado

*Standardize grades (z-score) by year: subtract the mean and divide by the standard deviation
*of all observations of that year. Math and language first, then the non-incentivized courses
foreach subj in mate comu arte cta etra efis erel fcc hge ingl pfrrhh {
    gen std_`subj'=.
    forvalues yr=2013/2015 {
        quietly summarize `subj' if id_anio==`yr'
        replace std_`subj'=(`subj'-r(mean))/r(sd) if id_anio==`yr'
    }
}

label variable std_mate "Math Grade (z-score)"
label variable std_comu "Language Grade (z-score)"

* Non-incentivized courses: row average of the raw grades and of the z-scores
local others arte cta etra efis erel fcc hge ingl pfrrhh
local std_others
foreach subj of local others {
    local std_others `std_others' std_`subj'
}
egen nonincent=rowmean(`others')
egen std_nonincent=rowmean(`std_others')
label variable std_nonincent "Non-Incentivized Course Grades (z-score)"

* Difference-in-differences indicators: 8th grade ("SEGUNDO") is the treated grade, 2015 the post year
gen treat=(grado=="SEGUNDO")
gen post=(id_anio==2015)
gen treat_post=treat*post
* For testing for parallel trends
gen year_2014=(id_anio==2014)
gen treat_year_2014=treat*year_2014

egen school_fe=group(cod_mod anexo)

** BE Group identifier
merge m:1 cod_mod anexo using "$data\ECE_groups_BE_new.dta"
drop if _merge==2
drop _merge

*Teacher characteristics (one row per year-school-grade)
merge m:1 id_anio cod_mod anexo grado using "$data\Teacher_Grade_Characteristics.dta"
* Teacher cells without any student in the sample are not needed (same treatment as the other merges here)
drop if _merge==2

*In 1.3% of the year-school-grade obs I don't have teacher data
gen no_teacher_data=(_merge==1)
* _merge has to be dropped before the next merge, which otherwise stops with "_merge already defined"
drop _merge


*Does a primary school operate in the same building? Did it participate in ECE in 2013 or 2014?
merge m:1 cod_mod anexo using "$data\Primary_Secondary.dta"
drop if _merge==2
drop _merge
* Schools not found in Primary_Secondary.dta are coded 0
replace has_primaria_ece=0 if has_primaria_ece==.
replace has_primaria_ganador_ece=0 if has_primaria_ganador_ece==.

*Dummy for passing the subject (grade of 11 or more); missing when the grade is missing
foreach x in mate comu arte cta etra efis erel fcc hge ingl pfrrhh {
gen aprobo_`x'=`x'>=11 if `x'!=.
}
* Share of the non-incentivized courses that the student passed
egen aprobo_nonincent=rowmean(aprobo_arte aprobo_cta aprobo_etra aprobo_efis aprobo_erel aprobo_fcc aprobo_hge aprobo_ingl aprobo_pfrrhh)
drop aprobo_arte aprobo_cta aprobo_etra aprobo_efis aprobo_erel aprobo_fcc aprobo_hge aprobo_ingl aprobo_pfrrhh

* Flag one row per year-school-grade cell
bysort id_anio cod_mod anexo grado: gen first_grado=(_n==1)
*Teacher teaches both grades: share of the grade's teachers who also teach the other grade
*(9th for 8th graders, 8th for 9th graders)
gen perc_both=cond(grado=="SEGUNDO", perc_teaches_9no, perc_teaches_8vo) if inlist(grado, "SEGUNDO", "TERCERO")


* Labels: outcomes (raw grade and pass dummy share the same label)
foreach prefix in "" "aprobo_" {
    label var `prefix'mate "Math"
    label var `prefix'comu "Language"
    label var `prefix'nonincent "Other courses - average"
}

* Labels: individual characteristics
label var repetidor "Repeated last year"
label var male "Male"
label var foreigner "Foreigner"
label var lengua_materna_cast "Spanish is native tongue"
label var discapacidad "Has a disability"
label var father_alive "Father is alive"
label var mother_alive "Mother is alive"
label var lives_father "Father lives in HH"
label var lives_mother "Mother lives in HH"

* Labels: grade/school characteristics
label var rural "Rural"
label var num_secciones_grado "Number of classes"
label var teacher_pupil_ratio_grade "Teacher-pupil ratio"
label var num_teachers_grado "Number of teachers"
label var avg_age_grade "Average age of teachers"
label var perc_male_grade "Share of male teachers"
label var perc_both "Share of teachers instructing the other grade"

* Labels: difference-in-differences terms
label var treat "8th Grade"
label var treat_post "8th Grade x Post"
label var treat_year_2014 "8th Grade x 2014"

save "$data\internal_grades_2013_2015.dta", replace
}

}


************************
** TABLES AND FIGURES **
************************

* Analysis dataset built in the construction section above
use "$data\internal_grades_2013_2015.dta", clear

* Individual controls used in the regressions below
global controls  repetidor male foreigner lengua_materna_cast discapacidad father_alive mother_alive lives_father lives_mother



**************
** TABLE 2 **
*************

* Parallel-trends check: the difference-in-differences regression plus an 8th grade x 2014 term,
* with school fixed effects and standard errors clustered by school
foreach x in mate comu nonincent {
areg std_`x' treat_post treat_year_2014 treat i.id_anio ${controls} , absorb(school_fe) cluster(school_fe)
eststo parallel_`x'
}

* keep() must name coefficients that exist in the models: the former entry treat_post2 does not
* exist (esttab stops on it), and the term this table is about is treat_year_2014
esttab parallel_mate parallel_comu parallel_nonincent using "Table2.tex", replace b(3) se(3)  star(* 0.10 ** 0.05 *** 0.01) nonotes l mtitles("Math" "Language" "Non-Incentivized Courses") ///
scalars("N Observations" "r2 R$^2$") sfmt(%12.0fc %12.3fc)   keep(treat_post treat_year_2014 treat)


**************
** TABLE 3  **
**************

*Student Characteristics
* Summary statistics use only the students with no missing value in the grades and controls
egen num_miss=rowmiss(mate comu repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother)

local outcomes mate comu nonincent aprobo_mate aprobo_comu aprobo_nonincent
local others   foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother

* 9th grade (comparison)
estpost sum `outcomes' repetidor male `others' if treat==0 & num_miss==0
eststo stats_indiv_0

* 8th grade (treated); male is listed before repetidor here, unlike in the comparison column
estpost sum `outcomes' male repetidor `others' if treat==1 & num_miss==0
eststo stats_indiv_1


*Grade Characteristics
* One row per year-school-grade cell (first_grado==1), comparison grade first and then treated grade
local gradevars rural  num_secciones_grado  teacher_pupil_ratio_grade   num_teachers_grado perc_both avg_age_grade  perc_male_grade
forvalues t=0/1 {
    estpost sum `gradevars' if treat==`t' & first_grado==1
    est store stats_grade_`t'
}


esttab stats_indiv_1 stats_indiv_0 using "Table3_a.tex", replace ///
    refcat(mate "\textbf{\textit{Final Grade (0-20)}}" aprobo_mate "\textbf{\textit{Passed the Course}}" male "\textbf{\textit{Other Individual Characteristics}}") ///
    cells(" mean(fmt(2)) sd(fmt(2))") noobs collabels("Mean" "Std. Dev") label nonum
esttab stats_grade_1 stats_grade_0 using "Table3_b.tex", replace ///
    refcat(rural "\textbf{\textit{Grade/School Characteristics}}") ///
    cells("mean(fmt(2)) sd(fmt(2))") noobs collabels( "Mean" "Std. Dev") label nonum


***************************
** TABLES 4, 5, and A.3  **
***************************

* Difference-in-differences estimates for each standardized outcome, three specifications:
*   _2: year FE only      _3: year + school FE      _5: year + school FE + individual controls
* Standard errors are clustered by school. The yearfe/schoolfe/individ locals feed the
* indicator rows at the bottom of the tables.
foreach x in mate comu nonincent {

reg std_`x' treat_post treat i.id_anio,  cluster(school_fe)
eststo `x'_2
estadd local yearfe "$\checkmark$"
estadd local schoolfe ""
estadd local individ ""

areg std_`x' treat_post treat i.id_anio, absorb(school_fe) cluster(school_fe)
eststo `x'_3
estadd local yearfe "$\checkmark$"
estadd local schoolfe "$\checkmark$"
estadd local individ ""

areg std_`x' treat_post treat i.id_anio ${controls}, absorb(school_fe) cluster(school_fe)
eststo `x'_5
estadd local yearfe "$\checkmark$ "
estadd local schoolfe "$\checkmark$"
estadd local individ "$\checkmark$"
}

esttab mate_2 mate_3 mate_5 comu_2 comu_3 comu_5  using "Table4.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  ///
nonotes l mgroups("Math" "Language" , pattern(1 0 0 1 0 0 ) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
scalars("N Observations" "r2 R$^2$ " "yearfe Year FE" "schoolfe School FE" "individ Individual Controls") sfmt(%12.0fc %12.3fc %#s %#s %#s)   keep(treat_post treat repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 

esttab  nonincent_2 nonincent_3 nonincent_5 using "Table5.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  nonotes l mgroups("Non-Incentivized Courses", pattern( 1 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
scalars("N Observations" "r2 R$^2$ " "yearfe Year FE" "schoolfe School FE" "individ Individual Controls") sfmt(%12.0fc %12.3fc %#s %#s %#s)   keep(treat_post treat repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 

* All non incentivized subjects (Appendix Table): full specification, one regression per course
foreach x in arte cta etra efis erel fcc hge ingl pfrrhh  {
areg std_`x' treat_post treat i.id_anio ${controls} , absorb(school_fe) cluster(school_fe)
eststo `x'
* The table below prints the Year FE / School FE / Individual Controls rows; without these
* locals the rows came out blank although every model includes all three
estadd local yearfe "$\checkmark$"
estadd local schoolfe "$\checkmark$"
estadd local individ "$\checkmark$"
}

esttab arte cta hge ingl fcc pfrrhh efis erel etra using "TableA.3.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  nonotes l ///
scalars("N Observations" "r2 R$^2$" "yearfe Year FE" "schoolfe School FE" "individ Individual Controls") sfmt(%12.0fc %12.3fc %#s %#s %#s)   keep(treat_post treat repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 

*********************
** TABLE 6 AND A.5 **
*********************

*Build covariates for subgroup analysis
* Number of parents (0, 1 or 2) with low education, and its mirror image
gen parents_low_educ=father_low_educ+mother_low_educ
recode parents_low_educ (0 = 2) (1 = 1)  (2 = 0), gen(parents_high_educ)

*Previous grade - standardized within year x school x grade (treat identifies the grade)
foreach subj in mate comu {
    bysort id_anio school_fe treat: egen mean_previous_`subj'=mean(previous_`subj')
    bysort id_anio school_fe treat: egen sd_previous_`subj'=sd(previous_`subj')
    gen std_previous_`subj'=(previous_`subj'-mean_previous_`subj')/sd_previous_`subj'
    drop mean_previous_`subj' sd_previous_`subj'
}

* TABLE 6
* Generic placeholders, overwritten for each covariate so that every column of the table
* shares the same coefficient names
gen treat_post_interaction=.
gen treat_interaction=.
gen post_interaction=.
gen covariate=.

* Terms common to all the heterogeneity regressions
local did_terms treat_post treat_post_interaction treat treat_interaction post post_interaction  covariate

* Student and school covariates
foreach z in male lengua_materna_cast  parents_high_educ  repetidor    rural  {

    replace treat_post_interaction=treat_post*`z'
    replace treat_interaction=treat*`z'
    replace post_interaction=post*`z'
    replace covariate=`z'

    foreach subj in mate comu {
        qui areg std_`subj' `did_terms' i.id_anio ${controls} , absorb(school_fe) cluster(school_fe)
        eststo `subj'_`z'
    }
}

* Lagged grade of the same subject as the outcome
foreach subj in mate comu {
    replace treat_post_interaction=treat_post*std_previous_`subj'
    replace treat_interaction=treat*std_previous_`subj'
    replace post_interaction=post*std_previous_`subj'
    replace covariate=std_previous_`subj'
    areg std_`subj' `did_terms' i.id_anio ${controls}  , absorb(school_fe) cluster(school_fe)
    eststo previous_`subj'
    test treat_post +treat_post_interaction=0
}


label variable treat_post_interaction "8th Grade x Post x Covariate"
label variable covariate "Covariate"

esttab mate_male mate_lengua_materna_cast mate_parents_high_educ mate_repetidor  previous_mate mate_rural using "Table6_mate.tex", ///
    replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l scalars("N Observations" "r2 R$^2$") sfmt(%15.0fc %12.3fc ) ///
    mtitles("Male" "Spanish Speaker"  "Parents' Education" "Retained"  "Lagged Grade"  "Rural") keep(treat_post treat_post_interaction)

esttab comu_male comu_lengua_materna_cast comu_parents_high_educ comu_repetidor  previous_comu  comu_rural using "Table6_comu.tex", ///
    replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l scalars("N Observations" "r2 R$^2$") sfmt(%15.0fc %12.3fc ) ///
    mtitles("Male" "Spanish Speaker"  "Parents' Education" "Retained"  "Lagged Grade" "Rural") keep(treat_post treat_post_interaction)


* TABLE A.5
* Quartile dummies of the standardized lagged grade (missing when the lagged grade is missing).
* The creation order (language first) matters for the range drop at the end of this block.
foreach subj in comu mate {
    sum std_previous_`subj', detail
    gen quartile_`subj'_1=std_previous_`subj'<r(p25) if std_previous_`subj'!=.
    gen quartile_`subj'_2=std_previous_`subj'>=r(p25) & std_previous_`subj'<r(p50) if std_previous_`subj'!=.
    gen quartile_`subj'_3=std_previous_`subj'>=r(p50) & std_previous_`subj'<r(p75) if std_previous_`subj'!=.
    gen quartile_`subj'_4=std_previous_`subj'>=r(p75) if std_previous_`subj'!=.
}


* Interactions of the difference-in-differences terms with each quartile dummy
forvalues q=1/4 {
    foreach subj in mate comu {
        gen treat_post_`subj'_`q'=treat_post*quartile_`subj'_`q'
        gen treat_`subj'_`q'=treat*quartile_`subj'_`q'
        gen post_`subj'_`q'=post*quartile_`subj'_`q'
    }
}

* 2013 is excluded from the sample; after each regression, test the total effect
* (treat_post + interaction) for quartiles 2 to 4 and store the p-values as pvalue1-pvalue3
foreach subj in mate comu {
    areg std_`subj' treat_post treat_post_`subj'_2 treat_post_`subj'_3 treat_post_`subj'_4 ///
        treat treat_`subj'_2 treat_`subj'_3 treat_`subj'_4 ///
        post post_`subj'_2 post_`subj'_3 post_`subj'_4 ///
        quartile_`subj'_1 quartile_`subj'_2 quartile_`subj'_3 quartile_`subj'_4 ///
        i.id_anio ${controls} if id_anio!=2013  , absorb(school_fe) cluster(school_fe)
    eststo `subj'_quartiles
    forvalues q=2/4 {
        local k=`q'-1
        test treat_post+treat_post_`subj'_`q'=0
        estadd scalar pvalue`k'= r(p)
    }
}

foreach subj in mate comu {
    forvalues q=2/4 {
        label variable treat_post_`subj'_`q' "8th Grade x Post x Q`q'"
    }
}

esttab mate_quartiles comu_quartiles using "TableA.5_quartiles.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l ///
 scalars("N Observations" "r2 R$^2$" "pvalue1 P-value (sum of coefficients Q2)" "pvalue2 P-value (sum of coefficients Q3)" "pvalue3 P-value (sum of coefficients Q4)") ///
 sfmt(%12.0fc %12.3fc %12.3fc %12.3fc %12.3fc) mtitles("Math" "Language") keep(treat_post treat_post_mate_2 treat_post_mate_3 treat_post_mate_4 treat_post treat_post_comu_2 treat_post_comu_3 treat_post_comu_4) 
* Range drop by variable order: removes every quartile dummy and interaction created in this block
drop quartile_comu_1- post_comu_4



* Same exercise as the quartile table, but splitting the lagged grade at its
* 33rd and 66th percentiles.
foreach subj in comu mate {
    _pctile std_previous_`subj', p(33 66)
    tempname lo_cut hi_cut
    scalar `lo_cut' = r(r1)
    scalar `hi_cut' = r(r2)

    generate tercile_1_`subj' = std_previous_`subj' < `lo_cut'
    generate tercile_2_`subj' = std_previous_`subj' >= `lo_cut' & std_previous_`subj' < `hi_cut'
    generate tercile_3_`subj' = std_previous_`subj' >= `hi_cut'

    * A missing lagged grade would otherwise land in the top tercile.
    forvalues t = 1/3 {
        replace tercile_`t'_`subj' = . if std_previous_`subj' == .
    }
}

* Interactions of each tercile dummy with the difference-in-differences terms.
forvalues t = 1/3 {
    foreach subj in mate comu {
        generate treat_post_`t'_`subj' = treat_post * tercile_`t'_`subj'
        generate treat_`t'_`subj' = treat * tercile_`t'_`subj'
        generate post_`t'_`subj' = post * tercile_`t'_`subj'
    }
}

* One regression per subject; pvalue1 and pvalue2 test whether the total
* effect in terciles 2 and 3 is zero.
foreach subj in mate comu {
    areg std_`subj' treat_post treat_post_2_`subj' treat_post_3_`subj' ///
        treat treat_2_`subj' treat_3_`subj' ///
        post post_2_`subj' post_3_`subj' ///
        tercile_1_`subj' tercile_2_`subj' tercile_3_`subj' ///
        i.id_anio ${controls} if id_anio!=2013, absorb(school_fe) cluster(school_fe)
    eststo `subj'_terciles

    forvalues t = 2/3 {
        local k = `t' - 1
        test treat_post + treat_post_`t'_`subj' = 0
        estadd scalar pvalue`k' = r(p)
    }
}

foreach subj in mate comu {
    forvalues t = 2/3 {
        label variable treat_post_`t'_`subj' "8th Grade x Post x T`t'"
    }
}

esttab mate_terciles comu_terciles using "TableA.5_terciles.tex", ///
    replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l ///
    scalars("N Observations" "r2 R$^2$" "pvalue1 P-value (sum of coefficients T2)" "pvalue2 P-value (sum of coefficients T3)") ///
    sfmt(%12.0fc %12.3fc %12.3fc %12.3fc) mtitles("Math" "Language") ///
    keep(treat_post treat_post_2_mate treat_post_3_mate treat_post_2_comu treat_post_3_comu)


**************
** TABLE 7  **
**************

* Flag one observation per year, school and grade; the regressions below are
* restricted to the flagged rows.
bysort id_anio cod_mod anexo grado: gen f=_n==1
rename avg_num_classes_school_grade avg_num_classes

* Outcomes of Table 7, in column order.
local composition avg_age_grade perc_male_grade avg_num_classes avg_num_schools perc_new_to_school perc_new_to_grade

foreach outcome of local composition {
    areg `outcome' treat_post treat i.id_anio if f==1, absorb(school_fe) cluster(school_fe)
    eststo `outcome'
    * Mean of the outcome over the flagged rows, reported under the table.
    summarize `outcome' if f==1
    estadd scalar mean_dep = r(mean)
}

esttab `composition' using "Table7.tex", ///
    replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l ///
    mtitles ("Avg. Age" "Share Male" "Average Number of Classes" "Number of Schools" "Share New to School" "Share New to Grade" ) ///
    scalars("N Observations" "r2 R$^2$" "mean_dep Dependent Variable Mean") sfmt(%12.0fc %12.3fc) ///
    keep(treat_post treat)



*************
** TABLE 8 **
*************

* Table 8: baseline regressions split by the level of perc_teaches_9no in the
* school (presumably the share of teachers who also teach 9th grade - TODO
* confirm against the construction of perc_teaches_9no).

* Scratch copy of the overlap measure, kept only for 2015 observations that
* are not in grade TERCERO. aux is a scratch name, so release it first in case
* an earlier section left one behind.
capture drop aux
gen aux=perc_teaches_9no
replace aux=. if grado=="TERCERO"
replace aux=. if id_anio!=2015

* School-level average, copied to every observation of the school.
bysort cod_mod anexo: egen avg_overlap=mean(aux)

* f already exists at this point (it is created for Table 7 and never dropped),
* so a plain gen would stop the do-file. Release it and rebuild it as a
* one-observation-per-school flag.
capture drop f
bysort cod_mod anexo: gen f=_n==1

* Cutoffs come from the school-level distribution. Only the 10th (r1) and the
* 30th (r3) percentiles are used: low is below p10, med is p10 to p30, high is
* p30 and above.
_pctile avg_overlap if f==1, percentiles(10 20 30 40 50 60 70 80)
gen low_overlap=avg_overlap<r(r1)
gen med_overlap=avg_overlap>=r(r1) & avg_overlap<r(r3)
gen high_overlap=avg_overlap>=r(r3)

* Schools without an overlap measure belong to no category; without this
* step they would be coded as high because missing compares as large.
foreach level in low med high {
    replace `level'_overlap=. if avg_overlap==.
}

* Descriptive check of the overlap level within each category.
foreach level in low med high {
    sum avg_overlap if `level'_overlap==1 & f==1
}

foreach x in low med high{
    foreach y in mate comu  {
        areg std_`y' treat_post treat post i.id_anio ${controls} if  `x'_overlap==1 , absorb(school_fe) cluster(school_fe)
        eststo `x'_overlap_`y'
    }
}

label variable treat "8th Grade"
label variable treat_post "8th Grade x Post"

esttab low_overlap_mate med_overlap_mate high_overlap_mate low_overlap_comu med_overlap_comu high_overlap_comu   using "Table8.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  nonotes l mgroups("Math" "Language" , pattern(1 0 0 1 0 0 ) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
scalars("N Observations" "r2 R$^2$") sfmt(%12.0fc %12.3fc )   keep(treat_post treat repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 




*************
** TABLE 9 **
*************

*Average salary (calculated using the categories of teachers from the Censo escolar)
merge m:1 cod_mod anexo using "$data\perc_contratados.dta" // Perc of teachers with temporary positions
drop if _merge==2
* Drop the merge indicator so that it cannot block a later merge.
drop _merge

* Weighted average of the wage attached to each teacher category, using the
* school shares as weights.
* NOTE(review): there is no term for perc_nombrado_esc7; confirm whether that
* category is empty in the data or was left out by mistake.
gen average_salary=( 1554.90* perc_nombrado_sinesc + 1243.92*perc_contratado + 1554.90* perc_nombrado_esc1 + 1710.39* perc_nombrado_esc2  + 1710.39* perc_nombrado_esc3 + 2176.86* perc_nombrado_esc4 + 2643.33*perc_nombrado_esc5 + 3109.80*perc_nombrado_esc6+ 4042.74* perc_nombrado_esc8)
gen ln_salary=ln(average_salary)

* treat_post_interaction is still in memory from the Table 6 section (it is
* labelled there after the loop), so a plain gen would stop the do-file.
* The two companion terms are presumably left over as well. Each name is
* dropped on its own because drop removes nothing if any listed name is absent.
foreach v in treat_post_interaction treat_interaction post_interaction {
    capture drop `v'
}
gen treat_post_interaction=treat_post*ln_salary
gen treat_interaction=treat*ln_salary
gen post_interaction=post*ln_salary

* Stored p-value: test that the effect at ln_salary equal to one is zero.
foreach y in mate comu {
    areg std_`y' treat_post treat_post_interaction treat treat_interaction post post_interaction i.id_anio ${controls}  , absorb(school_fe) cluster(school_fe)
    eststo `y'_contratado
    test treat_post+treat_post_interaction==0
    estadd scalar pvalue= r(p)
}

* Heterogeneity by class structure, using two moderators:
*   one_section  equals 1 when num_secciones_grado is 1 and 0 otherwise,
*                including when the count is missing
*   num_classes  the count itself, which is num_secciones_grado renamed
generate one_section = (num_secciones_grado==1)
generate treat_post_one_section = treat_post*one_section
generate treat_one_section = treat*one_section
generate post_one_section = post*one_section

rename num_secciones_grado num_classes
generate treat_post_num_classes = treat_post*num_classes
generate treat_num_classes = treat*num_classes
generate post_num_classes = post*num_classes

* Same specification for both moderators and both subjects. The stored
* p-value tests that the sum of the main and the interacted effect is zero.
foreach moderator in one_section num_classes {
    foreach subj in mate comu {
        quietly areg std_`subj' treat_post treat_post_`moderator' treat treat_`moderator' post_`moderator' `moderator' ///
            i.id_anio ${controls} if id_anio!=2012, absorb(school_fe) cluster(school_fe)
        eststo `moderator'_`subj'
        test treat_post+treat_post_`moderator'==0
        estadd scalar pvalue = r(p)
    }
}


* Heterogeneity by whether the school has a primary level in ECE, and by
* whether that primary level was a winner. Each moderator is interacted with
* the three difference-in-differences terms.
local src_primaria has_primaria_ece
local src_primaria_won has_primaria_ganador_ece

foreach kind in primaria primaria_won {
    generate treat_post_`kind' = treat_post*`src_`kind''
    generate treat_`kind' = treat*`src_`kind''
    generate post_`kind' = post*`src_`kind''
}

* Share of flagged observations with each characteristic.
summarize has_primaria_ece has_primaria_ganador_ece if f==1

* The stored p-value tests that the sum of the main and the interacted effect
* is zero.
foreach subj in mate comu {
    foreach kind in primaria primaria_won {
        areg std_`subj' treat_post treat_post_`kind' treat treat_`kind' post post_`kind' ///
            i.id_anio ${controls}, absorb(school_fe) cluster(school_fe)
        eststo `kind'_`subj'
        test treat_post +treat_post_`kind'=0
        estadd scalar pvalue = r(p)
    }
}

* Display labels for the rows of Table 9.
label variable treat_post_one_section "8th Grade x Post x One Group"
label variable treat_post_num_classes "8th Grade x Post x Number of Classes"
label variable  treat_post_primaria "8th Grade x Post x BE Primary"
label variable treat_post_primaria_won "8th Grade x Post x BE Primary Winner"
label variable treat_post "8th Grade x Post"
* The label previously started with a stray dollar sign, which either opens an
* unbalanced math mode in the LaTeX output or is read as a global macro.
label variable treat_post_interaction "8th Grade x Post x Ln (Average Salary)"

* Five columns per subject. The file name needs its dot: esttab picks the
* LaTeX format from the .tex suffix.
esttab primaria_mate primaria_won_mate mate_contratado num_classes_mate one_section_mate primaria_comu primaria_won_comu comu_contratado num_classes_comu one_section_comu   using "Table9.tex", ///
replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) nonotes l scalars("N Observations"  "r2 R$^2$" "pvalue P-Value (sum of both coefficients = 0)") sfmt(%12.0fc %12.3fc %12.3fc ) mgroups("Math" "Language" , pattern(1 0 0 0  0 1 0 0 0 0) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
keep(treat_post treat_post_interaction treat_post_one_section treat_post_num_classes  treat_post_primaria treat_post_primaria_won ) 

***************
** FIGURE 2  **
***************

* Yearly mean grade by group. The suffix 0 is the mean over treat==0 and the
* suffix 1 the mean over treat==1; each is missing on the other group's rows.
foreach x in mate comu nonincent{
    bysort id_anio: egen avg_`x'0=mean(`x') if treat==0
    bysort id_anio: egen avg_`x'1=mean(`x') if treat==1
}

preserve
* Reduce to one row per year holding both series. The six variables are named
* explicitly: a wildcard on the avg_ prefix would also keep unrelated
* school-level averages created earlier, leaving thousands of duplicate points.
* firstnm picks the single non-missing value each series has within a year.
collapse (firstnm) avg_mate0 avg_mate1 avg_comu0 avg_comu1 avg_nonincent0 avg_nonincent1, by(id_anio)
sort id_anio

cd "$results"
* In each graph, plots 1 and 2 are the markers and plots 3 and 4 the lines;
* the legend is built from the lines.
twoway (scatter avg_mate1 avg_mate0 id_anio, msymbol(O) mcolor(red black)) (line avg_mate1 avg_mate0 id_anio, lcolor(red black) lpattern(solid dash)), graphregion(color(white)) title("Math", color(black)) ytitle(Internal Grade) ysc(titlegap(2)) legend(order(3 "Mean - 8th graders" 4 "Mean - 9th graders")) xtitle(Year) xlab(2013(1)2015) ylab(12(0.2) 12.6)
graph save  mate_parallel.gph, replace
twoway (scatter avg_comu1 avg_comu0 id_anio, msymbol(O) mcolor(red black)) (line avg_comu1 avg_comu0 id_anio, lcolor(red black) lpattern(solid dash)), graphregion(color(white)) title("Language", color(black)) ytitle(Internal Grade) ysc(titlegap(2)) legend(order(3 "Mean - 8th graders" 4 "Mean - 9th graders")) xtitle(Year) xlab(2013(1)2015) ylab(12.5 (0.2) 12.9)
graph save  comu_parallel.gph, replace
twoway (scatter avg_nonincent1 avg_nonincent0 id_anio, msymbol(O) mcolor(red black)) (line avg_nonincent1 avg_nonincent0 id_anio, lcolor(red black) lpattern(solid dash)), graphregion(color(white)) title("Non-Incentivized Courses", color(black)) ytitle(Internal Grade) ysc(titlegap(2)) legend(order(3 "Mean - 8th graders" 4 "Mean - 9th graders")) xtitle(Year) xlab(2013(1)2015) ylab(13 (0.2) 13.6)
graph save  nonincent_parallel.gph, replace

grc1leg "mate_parallel.gph" "comu_parallel.gph" "nonincent_parallel.gph", graphregion(color(white))
graph export "Figure2.pdf", replace fontface(Times) 

restore

***************
** FIGURE 3  ** 
***************


* Figure 3: the baseline regression is re-estimated separately within each
* value of group_be. The treat_post coefficient, its 95 percent confidence
* bounds and its two-sided p-value are stored on the observations of that group.
foreach x in mate comu{
    gen coeff_`x'=.
    gen low_`x'=.
    gen high_`x'=.
    gen p_`x'=.
}

foreach x in mate comu {
    forvalues y=1/409{
        qui areg std_`x' treat_post treat i.id_anio ${controls} if group_be==`y', absorb(school_fe) cluster(school_fe)
        replace coeff_`x'=_b[treat_post] if group_be==`y'
        * invt at 0.025 is negative, so this is the lower bound.
        replace low_`x'=_b[treat_post] + _se[treat_post]*invt(e(df_r),0.025) if group_be==`y'
        replace high_`x'=_b[treat_post] + _se[treat_post]*invt(e(df_r),0.975) if group_be==`y'
        replace p_`x'=2*ttail(e(df_r),abs(_b[treat_post]/_se[treat_post])) if group_be==`y'
    }
}

gen significant_mate=(p_mate<0.05)
gen significant_comu=(p_comu<0.05)

* f is already in memory as a first-observation flag from the earlier tables,
* so a plain gen would stop the do-file. Release it before redefining it.
* NOTE(review): the flag is built on group while the estimates are indexed by
* group_be; confirm that both variables describe the same grouping.
capture drop f
bysort group: gen f=(_n==1)

gen positive_significant_mate=coeff_mate>0 & significant_mate==1
gen positive_significant_comu=coeff_comu>0 & significant_comu==1

sum significant_* positive_* if f==1

preserve
* One row per group. The variables are listed by name: wildcards on the low_
* and high_ prefixes would also keep low_overlap and high_overlap from Table 8,
* which vary across schools and would leave several rows per group.
keep group_be coeff_mate coeff_comu low_mate low_comu high_mate high_comu p_mate p_comu
duplicates drop

* Groups whose interval leaves the range from -2 to 2, or whose bounds are
* missing, are not drawn.
gen problem=high_comu>2 | low_comu<-2 | high_mate>2 | low_mate<-2

* Horizontal position is the rank of the group by coefficient size.
sort coeff_mate
gen n=_n
twoway  (rcap high_mate low_mate n if p_mate>=0.05 & problem==0, lcolor(gs8) lwidth(thin)) ///
(rcap high_mate low_mate n if p_mate<0.05 & problem==0, lcolor(black) lwidth(thin)) ///
(scatter coeff_mate n if p_mate>=0.05 & problem==0, mcolor(gs8) msize(vsmall)) ///
(scatter coeff_mate n if p_mate<0.05  & problem==0, mcolor(black) msize(vsmall)), ytitle (8th Grade x Post Coefficient (Math)) legend(off) xla(, labcolor(bg) tlength(0))
graph save "group_mate.gph", replace

drop n
sort coeff_comu
gen n=_n

twoway  (rcap high_comu low_comu n if  p_comu>=0.05 & problem==0, lcolor(gs8) lwidth(thin)) ///
(rcap high_comu low_comu n if p_comu<0.05 & problem==0, lcolor(black) lwidth(thin)) ///
(scatter coeff_comu n if  p_comu>=0.05 & problem==0, mcolor(gs8) msize(vsmall)) ///
(scatter coeff_comu n if  p_comu<0.05 & problem==0, mcolor(black) msize(vsmall)), ytitle (8th Grade x Post Coefficient (Language)) legend(off) xla(, labcolor(bg) tlength(0)) ///
xtitle(BE groups (ordered by coefficient size))
graph save "group_comu.gph", replace

graph combine "group_mate.gph" "group_comu.gph", rows(2) ysize(10) xsize(12)
graph export "Figure3.pdf", replace fontface(Times) 

restore


**************
** FIGURE 5 **
**************

* Index with Student Characteristics for 8th graders in 2015
gen mother_high_educ=1-mother_low_educ
replace mother_high_educ=. if mother_low_educ==.
gen father_high_educ=1-father_low_educ
replace father_high_educ=. if father_low_educ==.
gen spanish_native=lengua_materna_cast

* School-level share of each characteristic, computed only over treat==1
* observations in 2015 and copied to every observation of the school.
foreach x in  mother_high_educ father_high_educ spanish_native{
    gen aux_`x'=`x'
    replace aux_`x'=. if id_anio!=2015
    replace aux_`x'=. if treat==0

    bysort cod_mod anexo: egen perc_`x'=mean(aux_`x')
    drop aux_`x'
}

* The index is the plain sum of the three shares.
gen index_stud=perc_mother_high_educ+ perc_father_high_educ+ perc_spanish_native

* One flagged observation per school among treat==1 observations in 2015, so
* that each school counts once in the within-group percentiles.
bysort id_anio cod_mod anexo treat: gen first=(_n==1)
replace first=0 if id_anio!=2015 
replace first=0 if treat==0 

* Percentiles 5 to 95 of the index within each of the 409 values of group.
foreach y of numlist 5(5)95{
    gen percentile_`y'=.
    foreach x of numlist 1(1)409{
        _pctile index_stud if first==1 & group==`x', p(`y')
        replace percentile_`y'=r(r1) if group==`x'
    }
}

* Twenty bins of five percentiles each. The bin named k covers the index
* values from percentile k-5 up to, but not including, percentile k.
gen is_perc_5=index_stud<percentile_5
forvalues hi=10(5)95 {
    local lo = `hi' - 5
    gen is_perc_`hi'=index_stud>=percentile_`lo' & index_stud<percentile_`hi'
}
gen is_perc_100=index_stud>=percentile_95 

* A missing index compares as larger than any number, so without this step
* schools with no index would be counted in the top bin. Set them to missing,
* as is done for the quartile and overlap indicators elsewhere in this file.
foreach y of numlist 5(5)100 {
    replace is_perc_`y'=. if missing(index_stud)
}

* Bin-specific difference-in-differences terms. The regressor lists are
* collected while the variables are created, in the same order as before:
* all treat_post terms first, then the treat and post pair of each bin.
local tp_terms
local lower_terms
foreach y of numlist 5(5)100{
    gen treat_post_p`y'=treat_post*is_perc_`y'
    gen treat_p`y'=treat*is_perc_`y'
    gen post_p`y'=post*is_perc_`y'
    label variable treat_post_p`y' "`y'"
    local tp_terms `tp_terms' treat_post_p`y'
    local lower_terms `lower_terms' treat_p`y' post_p`y'
}

* One panel per subject: estimate, plot the twenty treat_post coefficients
* with their confidence intervals, and save the panel.
foreach subj in mate comu {
    if "`subj'"=="mate" {
        local panel Math
    }
    else {
        local panel Comu
    }

    areg std_`subj' `tp_terms' `lower_terms' i.id_anio ${controls} , absorb(school_fe) cluster(school_fe)
    coefplot , keep(`tp_terms') ///
        vertical ytitle(8th Grade x Post Coefficient) xtitle(Within-BE Group Percentile by SES Index) ///
        ciopts(lcolor(black) recast(rcap)) yline(0,lcolor(gs12)) msize(small) mcolor(black) legend(off) graphregion(color(white)) nooffsets
    graph save Graph "Percentiles_`panel'.gph", replace
}

graph combine "Percentiles_Math.gph" "Percentiles_Comu.gph" , rows(2) ycommon
graph export "Figure5.pdf", replace fontface(Times) 




****************
** FIGURE A.1 **
****************

* Figure A.1: within-school comparison of the two 8th grade classes in 2014.
* For each school, test equality of means and of standard deviations of the
* grades between its two classes, then plot the absolute difference against
* the p-value, one marker per school.
preserve
drop if mate==. | comu==.

* Earlier sections rename these two variables. Put the original names back
* inside this preserved copy so that the keep list works in either case.
capture confirm variable num_secciones_grado
if _rc {
    rename num_classes num_secciones_grado
}
capture confirm variable avg_num_classes_school_grade
if _rc {
    rename avg_num_classes avg_num_classes_school_grade
}

keep  id_anio cod_mod anexo grado seccion turno comu mate num_secciones_grado avg_num_classes_school_grade school_fe  num_teachers_grado previous_mate previous_comu
drop if id_anio!=2014

keep if grado=="SEGUNDO"

keep if num_secciones_grado==2 & grado=="SEGUNDO" // Keep schools that have 2 8th grade groups in 2014
gen aux=1

* Number the classes within each school. The running sum uses by without a
* new sort, so the first-observation flags stay at the head of each class.
bysort cod_mod anexo  turno seccion: gen f=(_n==1)
by cod_mod anexo : gen seccion_id=sum(f)

* Consecutive school identifier for the loops, and a one-per-school flag for
* the plots. The flag was previously used below without being created.
drop school_fe
egen school_fe=group(cod_mod anexo)
egen f_school=tag(cod_mod anexo)

quietly summarize school_fe
local n_schools = r(max)

foreach y in mate comu {
    gen p_`y'=.
    gen mean_difference_`y'=.
    gen p_sd_`y'=.
    gen sd_difference_`y'=.
}

* School by school tests. A school where a test cannot be computed keeps
* missing values and is counted, instead of stopping the do-file.
local n_skipped = 0
forvalues x=1/`n_schools' {
    foreach y in mate comu{
        *P-value for difference in means, and average difference in means
        capture ttest `y' if school_fe==`x', by(seccion_id) unequal
        if _rc {
            local ++n_skipped
        }
        else {
            quietly replace p_`y'= r(p)  if school_fe==`x'
            quietly replace mean_difference_`y'=r(mu_1)-r(mu_2) if school_fe==`x'
        }

        *P-value for difference in SD, and average difference in SD
        capture sdtest `y' if school_fe==`x', by(seccion_id)
        if _rc {
            local ++n_skipped
        }
        else {
            quietly replace p_sd_`y'= r(p)  if school_fe==`x'
            quietly replace sd_difference_`y'=r(sd_1)-r(sd_2) if school_fe==`x'
        }
    }
}
display as text "Figure A.1: tests that could not be computed = `n_skipped'"

gen significant_mate=(p_mate<0.1)
gen significant_comu=(p_comu<0.1)

* Significance flags at 10, 1 and 5 percent; missing when the test is missing.
foreach y in mate comu{
    foreach x in 10 1 5 {
        if `x'==10 {
            local level 0.1
        }
        else if `x'==1 {
            local level 0.01
        }
        else {
            local level 0.05
        }
        gen significant_mean_`y'_`x'=(p_`y'<`level')
        gen significant_sd_`y'_`x'=(p_sd_`y'<`level')
        replace significant_mean_`y'_`x'=. if p_`y'==.
        replace significant_sd_`y'_`x'=. if p_sd_`y'==.
    }
}

* The plots show absolute differences.
foreach y in mate comu {
    replace mean_difference_`y'=mean_difference_`y'*-1 if mean_difference_`y'<0
    replace sd_difference_`y'=sd_difference_`y'*-1 if sd_difference_`y'<0
}

* Four panels: difference in means and in SD, for math and for language.
* Grey markers have a p-value of at least 0.1, red markers are below 0.1.
foreach stat in mean sd {
    if "`stat'"=="mean" {
        local pv p
        local ylab "Difference in Means"
    }
    else {
        local pv p_sd
        local ylab "Difference in SD"
    }
    foreach y in mate comu {
        sort `stat'_difference_`y'
        twoway (scatter `stat'_difference_`y' `pv'_`y' if f_school==1 & `pv'_`y'>=0.1, mlcolor(gs10) mfcolor(white) msize(small)) ///
            (scatter `stat'_difference_`y' `pv'_`y' if f_school==1 & `pv'_`y'<0.1, mlcolor(red) mfcolor(white) msize(small)), ///
            xlabel(0(0.1)1) ytitle("`ylab'") xtitle("P-Value") graphregion(color(white)) legend(off)
        graph save `stat'_`y'.gph, replace
    }
}

graph combine mean_mate.gph mean_comu.gph sd_mate.gph sd_comu.gph , graphregion(color(white)) 
graph export "FigureA.1.pdf", replace fontface(Times) 

restore


****************
** FIGURE A.2 **
****************

* Figure A.2: within-school comparison of 8th grade cohorts across the years
* before 2015. For each school, test equality of means and of standard
* deviations of the grades between years, then plot the absolute difference
* against the p-value, one marker per school.
preserve
drop if mate==. | comu==.

* An earlier section renames num_secciones_grado. Put the original name back
* inside this preserved copy so that the keep list works in either case.
capture confirm variable num_secciones_grado
if _rc {
    rename num_classes num_secciones_grado
}

keep  perc_new_to_grade perc_new_to_school id_anio cod_mod anexo grado seccion turno comu mate  num_secciones_grado school_fe  num_teachers_grado
drop if id_anio==2015

keep if  grado=="SEGUNDO"

* Consecutive school identifier for the loop.
drop school_fe
egen school_fe=group(cod_mod anexo)

quietly summarize school_fe
local n_schools = r(max)

foreach y in mate comu {
    gen p_`y'=.
    gen mean_difference_`y'=.
    gen p_sd_`y'=.
    gen sd_difference_`y'=.
}

* School by school tests over every school identifier. A school where a test
* cannot be computed keeps missing values and is counted; this replaces the
* hand-written loop bounds that stepped around one failing school.
local n_skipped = 0
forvalues x=1/`n_schools' {
    foreach y in mate comu{
        *P-value for difference in means, and average difference in means
        capture ttest `y' if school_fe==`x', by(id_anio) unequal
        if _rc {
            local ++n_skipped
        }
        else {
            quietly replace p_`y'= r(p)  if school_fe==`x'
            quietly replace mean_difference_`y'=r(mu_1)-r(mu_2) if school_fe==`x'
        }

        *P-value for difference in SD, and average difference in SD
        capture sdtest `y' if school_fe==`x', by(id_anio)
        if _rc {
            local ++n_skipped
        }
        else {
            quietly replace p_sd_`y'= r(p)  if school_fe==`x'
            quietly replace sd_difference_`y'=r(sd_1)-r(sd_2) if school_fe==`x'
        }
    }
}
display as text "Figure A.2: tests that could not be computed = `n_skipped'"

* Significance flags at 10 percent; missing when the test is missing.
foreach y in mate comu{
    gen significant_mean_`y'_10=(p_`y'<0.1)
    gen significant_sd_`y'_10=(p_sd_`y'<0.1)
    replace significant_mean_`y'_10=. if p_`y'==.
    replace significant_sd_`y'_10=. if p_sd_`y'==.
}

* The plots show absolute differences.
foreach y in mate comu {
    replace mean_difference_`y'=mean_difference_`y'*-1 if mean_difference_`y'<0
    replace sd_difference_`y'=sd_difference_`y'*-1 if sd_difference_`y'<0
}

* Reduce to one row per school; every remaining row is then plotted.
keep mean_difference_mate mean_difference_comu sd_difference_mate sd_difference_comu p_mate p_comu p_sd_mate p_sd_comu school_fe
duplicates drop 
gen f_school=1

* Four panels: difference in means and in SD, for math and for language.
* Grey markers have a p-value of at least 0.1, red markers are below 0.1.
foreach stat in mean sd {
    if "`stat'"=="mean" {
        local pv p
        local ylab "Difference in Means"
        local head "Difference in Means"
    }
    else {
        local pv p_sd
        local ylab "Difference in SD"
        local head "Difference in Standard Deviations"
    }
    foreach y in mate comu {
        if "`y'"=="mate" {
            local subject Math
        }
        else {
            local subject Language
        }
        sort `stat'_difference_`y'
        twoway (scatter `stat'_difference_`y' `pv'_`y' if f_school==1 & `pv'_`y'>=0.1, mcolor(gs10) msize(small) mfcolor(white)) ///
            (scatter `stat'_difference_`y' `pv'_`y' if f_school==1 & `pv'_`y'<0.1, mcolor(red) msize(small) mfcolor(white)), ///
            xlabel(0(0.1)1) title("`head' - `subject'", color(black)) ytitle("`ylab'") xtitle("P-Value") graphregion(color(white)) legend(off)
        graph save `stat'_`y'_2.gph, replace
    }
}

graph combine mean_mate_2.gph mean_comu_2.gph sd_mate_2.gph sd_comu_2.gph , graphregion(color(white)) 
graph export "FigureA.2.pdf", replace fontface(Times) 

restore


***************
** TABLE A.1 **
***************

*Have to create a new dataset with private school students as the control group
clear
use "$data\secundaria_23022016.dta", clear
keep if  grado=="SEGUNDO" 

*There's 70 obs in 2014 that are shifted, adjust them
gen space=" "
egen seccion2=concat(seccion space turno) if  col43!=""
replace seccion=seccion2  if  col43!=""
drop seccion2 space
tostring  arte- pfrrhh, replace
replace turno=arte if col43!="" 
replace arte=cta if col43!=""
replace cta=comu  if col43!=""
replace comu=efis if col43!=""
replace efis=etra  if col43!=""
replace etra=erel if col43!=""
replace erel=fcc  if col43!=""
replace fcc=hge  if col43!=""
replace hge=ingl  if col43!=""
replace ingl=mate  if col43!=""
replace mate=pfrrhh if col43!=""
replace pfrrhh=sit_final if col43!=""
destring  arte- pfrrhh, replace
replace sit_final=mot_ret	 if col43!=""
replace mot_ret=fecha_nacimiento if col43!=""
replace fecha_nacimiento=sexo if col43!=""
replace sexo=id_pais if col43!=""
replace id_pais=lugar_nacimiento   if col43!=""
replace lugar_nacimiento=dpto_prov_dist   if col43!=""
replace dpto_prov_dist=lengua_materna   if col43!=""
replace lengua_materna=segunda_lengua   if col43!=""
replace segunda_lengua=tipo_discapacidad   if col43!=""
replace tipo_discapacidad=instruccion_madre  if col43!=""
replace instruccion_madre=lugar_residencia  if col43!=""
tostring cod_mod_traslado, replace
replace lugar_residencia=cod_mod_traslado   if col43!=""
replace  cod_mod_traslado=situacion_matricula  if col43!=""
destring cod_mod_traslado, replace
replace  situacion_matricula=trabaja  if col43!=""
replace trabaja=horas_semanales_trabajo   if col43!=""
replace horas_semanales_trabajo=padre_vive    if col43!=""
replace  padre_vive=madre_vive   if col43!=""
replace  madre_vive=fecha_nacimiento_padre   if col43!=""
replace  fecha_nacimiento_padre=nivel_instruccion_padre  if col43!=""
replace  nivel_instruccion_padre=vive_con_estudiante_padre   if col43!=""
replace  vive_con_estudiante_padre=fecha_nacimiento_madre   if col43!=""
replace  fecha_nacimiento_madre= nivel_instruccion_madre    if col43!=""
replace  nivel_instruccion_madre=vive_con_estudiante_madre   if col43!=""
replace  vive_con_estudiante_madre=col43   if col43!=""
drop col43

destring  arte- pfrrhh, replace
destring fcc, replace force


merge m:1 id_anio anexo cod_mod using "$data\IIEE_secundaria.dta"
keep if _m==3
drop _m

*Drop some variables that we will not use later on and that vary within student-year-school but shouldn't, leading to duplicate observations
drop   dsc_caracteristica   lugar_nacimiento dpto_prov_dist segunda_lengua trabaja  horas_semanales_trabajo fecha_nacimiento_padre fecha_nacimiento_madre nom_dre nom_ugel ///
departamento provincia distrito cen_pob fecha_inicio_anio fecha_fin_anio cod_dre dsc_modalidad lugar_residencia nivel   
*Remove duplicate observations
duplicates drop


*** Deal with students with more than one observation in a given year
*Goal of this section: end up with at most one row per student (id_persona) and year (id_anio).
*The rules are applied from the most specific to the most generic, recomputing the per-student counters after each round of drops.
*NOTE(review): bysort and gsort do not fix the order of tied rows, so wherever "the first" row of a group is kept the surviving row
*can differ between runs unless the sort seed is fixed - confirm whether this matters for replication
{
gen aux=1
*num_obs = number of rows the student has in that year
bysort id_persona id_anio: egen num_obs=sum(aux)

*Do they have grades in all observations?
*A row "has grades" when either the math or the language grade is non-missing
gen has_grades=(mate!=. | comu!=.)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*If they have only one observation with grades, the other ones usually correspond to schools they transferred from after the school year started. Keep the only observation with grades
drop if has_grades==0 & num_has_grades==1 & num_obs>1 & sit_final=="Trasladado"

*Recount rows per student-year after the drop
drop num_obs
bysort id_persona id_anio: egen num_obs=sum(aux)
*Very few left now (0.09%). Mostly kids who have grades in more than one observation. Drop observations where the kid was transferred, for kids who have at least another observation with grades in which they were not transferred
gen transferred=sit_final=="Trasladado"
*Do the non-transferred obs have grades?
gen aux2=transferred==0 & has_grades==1
bysort id_persona id_anio: egen num_grades_nontrans=sum(aux2)
drop if num_obs>1 & transferred==1 & num_grades_nontrans>0
drop transferred num_grades_nontrans num_obs aux2 num_has_grades

*Recompute both counters on the reduced data
bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*Very few left now (0.06%). It's around 2204 kids, most have grades in 2 observations. How many of them are in the same school?
drop aux
*aux now flags the first row of each student-year-school, so its sum within student-year is the number of distinct schools
bysort id_persona id_anio cod_mod anexo: gen aux=(_n==1) 
bysort id_persona id_anio: egen num_schools=sum(aux)
*79% of kids with more than one obs are in the same school in all obs. For those who have more than one obs in the same school, it's just a covariate that differs >> I keep the first
tab num_schools if num_obs>1
*Rows with grades are sorted first within student-year, so the row that is kept has grades whenever one exists
gsort id_anio id_persona -has_grades
by id_anio id_persona: gen first_person=(_n==1)
drop if first_person==0 & num_schools==1 & num_obs>1

*There's also a few cases of kids with 2 schools, but 2 observations in one of the schools where just a covariate differs. In the duplicate school, keep one
drop aux first_person
gen aux=1
bysort id_persona id_anio cod_mod anexo: egen num_same_school=sum(aux)
gsort id_anio id_persona cod_mod  anexo -has_grades
by id_anio id_persona cod_mod anexo: gen first_person=(_n==1)
*Only groups with exactly two rows in the same school are collapsed here
drop if first_person==0 & num_same_school==2

drop first_person aux num_obs num_same_school
gen aux=1
bysort id_persona id_anio: egen num_obs=sum(aux)

*Now it's 466 students with 2 observations, all of which have data in two schools
*Note: cod_mod_traslado is the code of the school the student came from (the origin school). So if a student has grades in two different schools, but was transferred from one of them to another, keep the destination school
*aux2 flags rows that carry an origin-school code; num_transfers counts them within student-year
gen aux2=cod_mod_traslado!=.
bysort id_anio id_persona: egen was_transferred=max(aux2)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Drop the origin school
drop if num_obs>1 & num_transfers==1 & cod_mod_traslado==.

*If they have two transfers, one with grades and another without, keep the one with grades
drop num_has_grades
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)
drop if num_obs>1 & num_transfers==2 & has_grades==0 & num_has_grades==1 

drop num_transfers num_obs 
bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Only 154 students with more than one observation. It's 0.00% of the students. Drop one of them
*No rule is left to choose between the remaining rows: keep whichever row comes first within student-year
bysort id_persona id_anio: gen first_person=_n==1
drop if first_person==0 & num_obs>1
*Remove the helper variables; this is a positional range (num_schools through first_person), so it depends on the order in which the helpers were created above
drop num_schools- first_person
*Drop the student who passed away and therefore has no grades
drop if sit_final=="Fallecidos" & has_grades==0
}


*Individual Controls
*Each control is a 0/1 indicator built from a string variable of the raw file.
*Indicators created with an "if" qualifier are left missing when the source string is empty.
generate male                = (sexo=="HOMBRE")               if sexo!=""
generate foreigner           = (id_pais!="PE")                if id_pais!=""
generate lengua_materna_cast = (lengua_materna=="CASTELLANO") if lengua_materna!=""
generate discapacidad        = (tipo_discapacidad!="")

*Parent-specific controls: map the Spanish suffix of the raw variables to the English prefix of the new ones
local en_madre mother
local en_padre father

*Low education = no schooling, or complete/incomplete primary; missing when the education level is not reported
foreach p in madre padre {
	generate `en_`p''_low_educ = inlist(nivel_instruccion_`p', "NINGUNO", "PRIM.COMP", "PRIM.INCOM") if nivel_instruccion_`p'!=""
}
*A parent counts as alive when the field says "SI" or is empty
foreach p in padre madre {
	generate `en_`p''_alive = inlist(`p'_vive, "SI", "")
}
*Parent lives with the student
foreach p in padre madre {
	generate lives_`en_`p'' = (vive_con_estudiante_`p'=="SI")
}
generate repetidor = (situacion_matricula=="REPITE")

*The raw string variables are no longer needed
drop sexo id_pais lengua_materna tipo_discapacidad padre_vive madre_vive ///
	vive_con_estudiante_padre vive_con_estudiante_madre nivel_instruccion_madre nivel_instruccion_padre

*Keep identifiers, grades (positional range arte through sit_final) and the controls built above
keep id_persona id_anio cod_mod anexo tip_gestion grado seccion turno arte- sit_final fecha_nacimiento cod_ugel ///
	male foreigner lengua_materna_cast discapacidad mother_low_educ father_low_educ ///
	father_alive mother_alive lives_father lives_mother repetidor

*We only care about schools selected to take ECE (Grupo D)
*Schools that appear only in the ECE list (no students in memory) are not brought in
merge m:1 cod_mod anexo using "$data\ECE_secondary_schools.dta", keep(master match)
generate ece_school = (_merge==3)
drop _merge

*Now look at whether private schools were eligible for ECE in 2015
merge m:1 cod_mod anexo using "$data\ECE_secondary_private.dta", keep(master match)
replace ece_school = 1 if _merge==3
drop _merge

*Only keep schools eligible for ECE
drop if ece_school!=1

*Only keep schools that have all 3 years
*Flag one row per school-year, then count the flagged rows within each school
bysort cod_mod anexo id_anio: generate byte first_in_year = (_n==1)
bysort cod_mod anexo: egen num_years = total(first_in_year)
drop if num_years!=3
drop first_in_year num_years

*Standardize grades (z-score) by year
*For every course: subtract the mean and divide by the standard deviation of that course within each year.
*The two incentivized courses (mate, comu) come first, followed by the non-incentivized ones
foreach course in mate comu arte cta etra efis erel fcc hge ingl pfrrhh {
	generate std_`course' = .
	forvalues yr = 2013/2015 {
		quietly summarize `course' if id_anio==`yr'
		replace std_`course' = (`course' - r(mean)) / r(sd) if id_anio==`yr'
	}
}

label variable std_mate "Math Grade (z-score)"
label variable std_comu "Language Grade (z-score)"

*Row averages over the non-incentivized courses: raw grades first, then their z-scores
local other_courses arte cta etra efis erel fcc hge ingl pfrrhh
local std_other_courses
foreach course of local other_courses {
	local std_other_courses `std_other_courses' std_`course'
}
egen nonincent = rowmean(`other_courses')
egen std_nonincent = rowmean(`std_other_courses')
label variable std_nonincent "Non-Incentivized Course Grades (z-score)"

*Parallel-trends check with private schools as the comparison group.
*treat = 1 for every school that is not private; 2013 is the omitted year, so treat_year_2014 captures a
*public/private differential in 2014 and treat_post the differential in 2015
gen treat=(tip_gestion!="PRIVADA")
gen post=(id_anio==2015)
gen year_2014=(id_anio==2014)
gen treat_post=treat*post
*Interaction with the 2014 dummy (the original multiplied by a variable, post2, that is never created)
gen treat_year_2014=treat*year_2014
egen school_fe=group(cod_mod anexo)

global controls  repetidor male foreigner lengua_materna_cast discapacidad father_alive mother_alive lives_father lives_mother

*One regression per outcome: year dummies, individual controls, school fixed effects (absorbed) and
*standard errors clustered by school. treat is constant within school, so it is expected to be dropped as collinear with the fixed effects
foreach x in mate comu nonincent {
areg std_`x' treat_post treat_year_2014 treat i.id_anio ${controls}, absorb(school_fe) cluster(school_fe)
eststo `x'_priv_parallel

}

*Labels shown in the exported table (esttab is called with the label option)
label variable treat_post "Public x Post"
label variable treat_year_2014 "Public x 2014"
label variable repetidor "Repeated last year"   
label variable male "Male"
label variable foreigner "Foreigner"
label variable lengua_materna_cast "Spanish is native tongue"
label variable discapacidad "Has a disability"
label variable father_alive "Father is alive"
label variable mother_alive  "Mother is alive"
label variable  lives_father "Father lives in HH"
label variable  lives_mother "Mother lives in HH"


esttab  mate_priv_parallel comu_priv_parallel nonincent_priv_parallel using "TableA.1.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  nonotes l ///
mtitles ("Math" "Language" "Non-Incentivized Courses") scalars("N Observations" "r2 R$^2$" ) sfmt(%12.0fc %12.3fc )   keep(treat_post treat_year_2014 repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 


***************
** TABLE A.6 **
***************

*Panel A: MC ECE
*Student-level ECE 2015 results; the language and math scores are copied into comu and mate
use "$data\MC ECE\BD_2S_ECE2015_WEB.dta", clear // Data available at http://umc.minedu.gob.pe/wp-content/uploads/2016/03/2S_ECE2015.7z
gen comu= M500_L
gen mate= M500_M

*NOTE(review): assumes sexo_estu==0 codes male students; a missing sexo_estu is also set to 0 here - confirm against the ECE codebook
gen male= sexo_estu==0

*Z-score each test score within IDD (presumably the school identifier - confirm)
foreach x in mate comu{
bysort IDD: egen mean_`x'=mean(`x')
bysort IDD: egen sd_`x'=sd(`x')
gen std_`x'=(`x'-mean_`x')/sd_`x'
}

*Low achievement = first category of the ECE achievement-group variable
gen low_mate=( grupo_ECE_2S_2015_M==1)
gen low_comu=( grupo_ECE_2S_2015_C==1)

*Within-IDD regressions of each outcome on gender and the socioeconomic index, restricted to Gestion==1
*(presumably public schools - confirm), with standard errors clustered by IDD
foreach x in mate comu{
areg std_`x' male  ISE if  Gestion==1, absorb( IDD) cluster(IDD)
eststo std_`x'
areg low_`x' male  ISE if  Gestion==1, absorb( IDD) cluster(IDD)
eststo low_`x'

}

label variable male "Male"
label variable ISE "Socioeconomic status index"

*The scalars() list is closed with a single quote after R$^2$; the original had a stray extra double quote that unbalanced the option
esttab  std_mate low_mate std_comu low_comu using "TableA.6_a.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  noconstant nonotes l mgroups("Math" "Language" , pattern(1 0  1 0  ) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) mtitles("Grade (z-score)" "Low Achievement"  "Grade (z-score)" "Low Achievement" ) scalars("N Observations" "r2 R$^2$") sfmt(%12.0fc %12.3fc) order(ISE male)

*Panel B: Internal Grades
use "$data\internal_grades_2013_2015.dta", clear

*Work on a temporary copy restricted to grade "SEGUNDO" in 2015; the full file is restored at the end
preserve
keep if grado=="SEGUNDO" & id_anio==2015

*Failing indicators are the complement of the passing indicators
foreach subject in mate comu {
	generate desaprobo_`subject' = 1 - aprobo_`subject'
}

*High-education indicators are the complement of the low-education ones (a missing input yields a missing result)
foreach parent in mother father {
	generate `parent'_high_educ = 1 - `parent'_low_educ
}

*The z-scores stored in the file are discarded and rebuilt within school below
drop std_mate std_comu

local covariates lengua_materna_cast mother_high_educ father_high_educ male

foreach subject in mate comu {
	*Z-score of the grade within school
	bysort school_fe: egen mean_`subject' = mean(`subject')
	bysort school_fe: egen sd_`subject' = sd(`subject')
	generate std_`subject' = (`subject' - mean_`subject') / sd_`subject'
	drop mean_`subject' sd_`subject'
}

foreach subject in mate comu {
	*Column 1: standardized grade; column 2: failed the course. School fixed effects, SEs clustered by school
	areg std_`subject' `covariates', absorb(school_fe) cluster(school_fe)
	eststo `subject'_1
	areg desaprobo_`subject' `covariates', absorb(school_fe) cluster(school_fe)
	eststo `subject'_2
}

label variable lengua_materna_cast "Spanish is native tongue"
label variable mother_high_educ "Mother has high education"
label variable father_high_educ "Father has high education"
label variable male "Male"

esttab mate_1 mate_2 comu_1 comu_2  using "TableA.6_b.tex", replace b(3) se(3) star(* 0.10 ** 0.05 *** 0.01)  noconstant nonotes l ///
	mgroups("Math" "Language" , pattern(1 0  1 0  ) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
	mtitles("Grade (z-score)" "Low Achievement"  "Grade (z-score)" "Low Achievement" ) ///
	scalars("N Observations" "r2 R$^2$") sfmt(%12.0fc %12.3fc)

restore


***************
** TABLE A.4 **
***************

*Load the raw student-level file and keep students in grades "TERCERO" and "PRIMERO"
use "$data\secundaria_23022016.dta", clear
keep if grado=="TERCERO" | grado=="PRIMERO"

*There's 70 obs in 2014 that are shifted, adjust them
*Shifted rows are identified by a non-empty col43. In those rows every field from turno onwards holds the value
*that belongs to the variable immediately before it, so each variable is overwritten with the content of the next one.
*The statements below must run in exactly this order: each replace reads a variable that has not been overwritten yet.
*seccion was split across seccion and turno in the shifted rows: join the two pieces back together with a space
gen space=" "
egen seccion2=concat(seccion space turno) if  col43!=""
replace seccion=seccion2  if  col43!=""
drop seccion2 space
*The variables from arte through pfrrhh are converted to string so they can be copied to/from string variables (turno, sit_final) during the shift
tostring  arte- pfrrhh, replace
replace turno=arte if col43!="" 
replace arte=cta if col43!=""
replace cta=comu  if col43!=""
replace comu=efis if col43!=""
replace efis=etra  if col43!=""
replace etra=erel if col43!=""
replace erel=fcc  if col43!=""
replace fcc=hge  if col43!=""
replace hge=ingl  if col43!=""
replace ingl=mate  if col43!=""
replace mate=pfrrhh if col43!=""
replace pfrrhh=sit_final if col43!=""
*Convert the arte-pfrrhh variables back to numeric now that they have been realigned
destring  arte- pfrrhh, replace
replace sit_final=mot_ret	 if col43!=""
replace mot_ret=fecha_nacimiento if col43!=""
replace fecha_nacimiento=sexo if col43!=""
replace sexo=id_pais if col43!=""
replace id_pais=lugar_nacimiento   if col43!=""
replace lugar_nacimiento=dpto_prov_dist   if col43!=""
replace dpto_prov_dist=lengua_materna   if col43!=""
replace lengua_materna=segunda_lengua   if col43!=""
replace segunda_lengua=tipo_discapacidad   if col43!=""
replace tipo_discapacidad=instruccion_madre  if col43!=""
replace instruccion_madre=lugar_residencia  if col43!=""
*cod_mod_traslado is made a string only for the two replaces that read and write it, then converted back
tostring cod_mod_traslado, replace
replace lugar_residencia=cod_mod_traslado   if col43!=""
replace  cod_mod_traslado=situacion_matricula  if col43!=""
destring cod_mod_traslado, replace
replace  situacion_matricula=trabaja  if col43!=""
replace trabaja=horas_semanales_trabajo   if col43!=""
replace horas_semanales_trabajo=padre_vive    if col43!=""
replace  padre_vive=madre_vive   if col43!=""
replace  madre_vive=fecha_nacimiento_padre   if col43!=""
replace  fecha_nacimiento_padre=nivel_instruccion_padre  if col43!=""
replace  nivel_instruccion_padre=vive_con_estudiante_padre   if col43!=""
replace  vive_con_estudiante_padre=fecha_nacimiento_madre   if col43!=""
replace  fecha_nacimiento_madre= nivel_instruccion_madre    if col43!=""
replace  nivel_instruccion_madre=vive_con_estudiante_madre   if col43!=""
*The overflow column col43 holds the last shifted value; once it is copied back the column is no longer needed
replace  vive_con_estudiante_madre=col43   if col43!=""
drop col43

*Second destring pass over the same range (it leaves already-numeric variables untouched).
*fcc is then converted with force, which turns any non-numeric entry into missing
destring  arte- pfrrhh, replace
destring fcc, replace force

*Merge with data from secondary schools
merge m:1 id_anio anexo cod_mod using  "$data\IIEE_secundaria.dta" 

*Only student rows that find a matching school record are kept; private schools are then excluded
keep if _m==3 
drop _merge
drop if tip_gestion=="PRIVADA"

*Drop some variables that we will not use later on and that vary within student-year-school but shouldn't, leading to duplicate observations
drop   dsc_caracteristica   lugar_nacimiento dpto_prov_dist segunda_lengua trabaja  horas_semanales_trabajo fecha_nacimiento_padre fecha_nacimiento_madre nom_dre nom_ugel ///
departamento provincia distrito cen_pob fecha_inicio_anio fecha_fin_anio cod_dre dsc_modalidad lugar_residencia nivel   
*Remove duplicate observations
duplicates drop

*** Deal with students with more than one observation in a given year
*Goal of this section: end up with at most one row per student (id_persona) and year (id_anio).
*The rules are applied from the most specific to the most generic, recomputing the per-student counters after each round of drops.
*NOTE(review): the counts and percentages quoted in the comments below are identical to those in the TABLE A.1 section, which uses a
*different sample - they may not apply to this one
*NOTE(review): bysort and gsort do not fix the order of tied rows, so wherever "the first" row of a group is kept the surviving row
*can differ between runs unless the sort seed is fixed - confirm whether this matters for replication
{
gen aux=1
*num_obs = number of rows the student has in that year
bysort id_persona id_anio: egen num_obs=sum(aux)

*Do they have grades in all observations?
*A row "has grades" when either the math or the language grade is non-missing
gen has_grades=(mate!=. | comu!=.)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*If they have only one observation with grades, the other ones usually correspond to schools they transferred from after the school year started. Keep the only observation with grades
drop if has_grades==0 & num_has_grades==1 & num_obs>1 & sit_final=="Trasladado"

*Recount rows per student-year after the drop
drop num_obs
bysort id_persona id_anio: egen num_obs=sum(aux)
*Very few left now (0.09%). Mostly kids who have grades in more than one observation. Drop observations where the kid was transferred, for kids who have at least another observation with grades in which they were not transferred
gen transferred=sit_final=="Trasladado"
*Do the non-transferred obs have grades?
gen aux2=transferred==0 & has_grades==1
bysort id_persona id_anio: egen num_grades_nontrans=sum(aux2)
drop if num_obs>1 & transferred==1 & num_grades_nontrans>0
drop transferred num_grades_nontrans num_obs aux2 num_has_grades

*Recompute both counters on the reduced data
bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)

*Very few left now (0.06%). It's around 2204 kids, most have grades in 2 observations. How many of them are in the same school?
drop aux
*aux now flags the first row of each student-year-school, so its sum within student-year is the number of distinct schools
bysort id_persona id_anio cod_mod anexo: gen aux=(_n==1) 
bysort id_persona id_anio: egen num_schools=sum(aux)
*79% of kids with more than one obs are in the same school in all obs. For those who have more than one obs in the same school, it's just a covariate that differs >> I keep the first
tab num_schools if num_obs>1
*Rows with grades are sorted first within student-year, so the row that is kept has grades whenever one exists
gsort id_anio id_persona -has_grades
by id_anio id_persona: gen first_person=(_n==1)
drop if first_person==0 & num_schools==1 & num_obs>1

*There's also a few cases of kids with 2 schools, but 2 observations in one of the schools where just a covariate differs. In the duplicate school, keep one
drop aux first_person
gen aux=1
bysort id_persona id_anio cod_mod anexo: egen num_same_school=sum(aux)
gsort id_anio id_persona cod_mod  anexo -has_grades
by id_anio id_persona cod_mod anexo: gen first_person=(_n==1)
*Only groups with exactly two rows in the same school are collapsed here
drop if first_person==0 & num_same_school==2

drop first_person aux num_obs num_same_school
gen aux=1
bysort id_persona id_anio: egen num_obs=sum(aux)

*Now it's 466 students with 2 observations, all of which have data in two schools
*Note: cod_mod_traslado is the code of the school the student came from (the origin school). So if a student has grades in two different schools, but was transferred from one of them to another, keep the destination school
*aux2 flags rows that carry an origin-school code; num_transfers counts them within student-year
gen aux2=cod_mod_traslado!=.
bysort id_anio id_persona: egen was_transferred=max(aux2)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Drop the origin school
drop if num_obs>1 & num_transfers==1 & cod_mod_traslado==.

*If they have two transfers, one with grades and another without, keep the one with grades
drop num_has_grades
bysort id_persona id_anio: egen num_has_grades=sum(has_grades)
drop if num_obs>1 & num_transfers==2 & has_grades==0 & num_has_grades==1 

drop num_transfers num_obs 
bysort id_persona id_anio: egen num_obs=sum(aux)
bysort id_anio id_persona: egen num_transfers=sum(aux2)
*Only 154 students with more than one observation. It's 0.00% of the students. Drop one of them
*No rule is left to choose between the remaining rows: keep whichever row comes first within student-year
bysort id_persona id_anio: gen first_person=_n==1
drop if first_person==0 & num_obs>1
*Remove the helper variables; this is a positional range (num_schools through first_person), so it depends on the order in which the helpers were created above
drop num_schools- first_person
*Drop the student who passed away and therefore has no grades
drop if sit_final=="Fallecidos" & has_grades==0
}

*A student is coded as a dropout when the remaining row has neither a math nor a language grade
gen dropped_out=has_grades==0
drop has_grades


*Individual Controls
*Each control is a 0/1 indicator built from a string variable of the raw file.
*Indicators created with an "if" qualifier are left missing when the source string is empty.
generate male                = (sexo=="HOMBRE")               if sexo!=""
generate foreigner           = (id_pais!="PE")                if id_pais!=""
generate lengua_materna_cast = (lengua_materna=="CASTELLANO") if lengua_materna!=""
generate discapacidad        = (tipo_discapacidad!="")

*Parent-specific controls: map the Spanish suffix of the raw variables to the English prefix of the new ones
local en_madre mother
local en_padre father

*Low education = no schooling, or complete/incomplete primary; missing when the education level is not reported
foreach p in madre padre {
	generate `en_`p''_low_educ = inlist(nivel_instruccion_`p', "NINGUNO", "PRIM.COMP", "PRIM.INCOM") if nivel_instruccion_`p'!=""
}
*A parent counts as alive when the field says "SI" or is empty
foreach p in padre madre {
	generate `en_`p''_alive = inlist(`p'_vive, "SI", "")
}
*Parent lives with the student
foreach p in padre madre {
	generate lives_`en_`p'' = (vive_con_estudiante_`p'=="SI")
}
generate repetidor = (situacion_matricula=="REPITE")

*The raw string variables are no longer needed
drop sexo id_pais lengua_materna tipo_discapacidad padre_vive madre_vive ///
	vive_con_estudiante_padre vive_con_estudiante_madre nivel_instruccion_madre nivel_instruccion_padre

*Keep identifiers, grades (positional range arte through sit_final), the dropout flag and the controls built above
keep id_persona id_anio cod_mod anexo tip_gestion grado seccion turno arte- sit_final fecha_nacimiento cod_ugel dropped_out ///
	male foreigner lengua_materna_cast discapacidad mother_low_educ father_low_educ ///
	father_alive mother_alive lives_father lives_mother repetidor


*Restrict to the schools listed in BE_ECE_Schools.dta; only matched student rows survive and no _merge variable is left behind
merge m:1 cod_mod anexo using "$data\BE_ECE_Schools.dta", keep(match) nogenerate

*Only keep schools that have both grades in all 3 years
*Flag one row per school-year-grade cell; a complete school has 2 grades x 3 years = 6 cells
bysort cod_mod anexo id_anio grado: generate byte first_in_cell = (_n==1)
bysort cod_mod anexo: egen num_grades_years = total(first_in_cell)
drop if num_grades_years!=6
drop first_in_cell num_grades_years

*Standardize grades (z-score) by year
*For every course: subtract the mean and divide by the standard deviation of that course within each year
foreach course in mate comu arte cta etra efis erel fcc hge ingl pfrrhh {
	generate std_`course' = .
	forvalues yr = 2013/2015 {
		quietly summarize `course' if id_anio==`yr'
		replace std_`course' = (`course' - r(mean)) / r(sd) if id_anio==`yr'
	}
}

label variable std_mate "Math Grade (z-score)"
label variable std_comu "Language Grade (z-score)"

*Row averages: non-incentivized courses (raw and standardized), then all eleven courses
egen nonincent = rowmean(arte cta etra efis erel fcc hge ingl pfrrhh)
egen std_nonincent = rowmean(std_arte std_cta std_etra std_efis std_erel std_fcc std_hge std_ingl std_pfrrhh)
label variable std_nonincent "Non-Incentivized Course Grades (z-score)"
egen gpa = rowmean(arte cta comu efis etra erel fcc hge ingl mate pfrrhh)

*Comparison across grades: "PRIMERO" is the treated group, 2015 is the post period
generate treat = (grado=="PRIMERO")
generate post = (id_anio==2015)
generate treat_post = treat*post

egen school_fe = group(cod_mod anexo)

*A course is failed when its grade is below 11; a missing grade never counts as failed
local failed_flags
foreach course in arte cta comu efis etra erel fcc hge ingl mate pfrrhh {
	generate failed_`course' = (`course'<11) & !missing(`course')
	local failed_flags `failed_flags' failed_`course'
}
egen num_failed = rowtotal(`failed_flags')

global controls  repetidor male foreigner lengua_materna_cast discapacidad father_alive mother_alive lives_father lives_mother

*Outcome definitions for the retention columns
gen passed_year=sit_final=="Aprobado"


*NOTE(review): automatically_retained is 1 only for students who failed exactly 4 courses; students with 5 or more failed
*courses are coded 0 - confirm whether num_failed>=4 was intended
gen automatically_retained=num_failed==4
gen possibly_retained=num_failed==2 | num_failed==3


*All models: treat x post, grade dummy, year dummies, individual controls, school fixed effects (absorbed),
*standard errors clustered by school. After each model the mean of the dependent variable and the three
*"Yes" indicators requested by esttab (yearfe, schoolfe, individ) are attached to the stored estimates;
*without them the Year FE / School FE / Individual Controls rows of the table would be printed empty
areg std_mate treat_post  treat i.id_anio ${controls}, absorb(school_fe) cluster(school_fe)
eststo mate
sum std_mate
estadd scalar mean_dep=r(mean)
estadd local yearfe "Yes"
estadd local schoolfe "Yes"
estadd local individ "Yes"

areg std_comu treat_post treat i.id_anio ${controls}, absorb(school_fe) cluster(school_fe)
eststo comu
sum std_comu
estadd scalar mean_dep=r(mean)
estadd local yearfe "Yes"
estadd local schoolfe "Yes"
estadd local individ "Yes"

*The retention models are restricted to students with both a math and a language grade
areg  automatically_retained treat_post  treat i.id_anio ${controls} if mate!=. & comu!=., absorb(school_fe) cluster(school_fe)
eststo automatically_retained
sum automatically_retained if mate!=. & comu!=.
estadd scalar mean_dep=r(mean)
estadd local yearfe "Yes"
estadd local schoolfe "Yes"
estadd local individ "Yes"

areg possibly_retained treat_post treat i.id_anio ${controls} if mate!=. & comu!=., absorb(school_fe) cluster(school_fe)
eststo possibly_retained
sum possibly_retained if mate!=. & comu!=.
estadd scalar mean_dep=r(mean)
estadd local yearfe "Yes"
estadd local schoolfe "Yes"
estadd local individ "Yes"

label variable treat "7th Grade"
label variable treat_post "7th Grade x Post"

*NOTE(review): the stored results dropped_out2 and transferred_out are not estimated anywhere in this section.
*If they are not in memory esttab stops with an error; if they are left over from an earlier table they come from a
*different sample. The dropped_out variable built above is never used - confirm the intended dropout/transfer models
esttab mate comu dropped_out2 transferred_out automatically_retained possibly_retained using "TableA.4.tex", replace  b(3) se(3) star(* 0.10 ** 0.05 *** 0.01) ///
nonotes l mgroups("Internal Grades" "Leaving School during Year" "Retention" , pattern(1 0 1 0  1 0 ) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
mtitles ("Math" "Language" "Dropout" "Transfer" "Automatically Retained" "Possibly Retained" ) ///
scalars("N Observations" "r2 R$^2$ " "mean_dep Dependent Variable Mean"  "yearfe Year FE" "schoolfe School FE" "individ Individual Controls") sfmt(%12.0fc %12.3fc %12.3fc %#s %#s %#s)   keep(treat_post treat repetidor male foreigner lengua_materna_cast discapacidad  father_alive mother_alive lives_father lives_mother) 



